From 439e7271dc2b63de379e37971dc2f64d71e24f8a Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Thu, 31 Aug 2017 16:37:41 -0400
Subject: livepatch: introduce shadow variable API

Add exported API for livepatch modules:

  klp_shadow_get()
  klp_shadow_alloc()
  klp_shadow_get_or_alloc()
  klp_shadow_free()
  klp_shadow_free_all()

that implement "shadow" variables, which allow callers to associate new
shadow fields to existing data structures.  This is intended to be used
by livepatch modules seeking to emulate additions to data structure
definitions.

See Documentation/livepatch/shadow-vars.txt for a summary of the new
shadow variable API, including a few common use cases.

See samples/livepatch/livepatch-shadow-* for example modules that
demonstrate shadow variables.

[jkosina@suse.cz: fix __klp_shadow_get_or_alloc() comment as spotted by
 Josh]
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/Makefile |   2 +-
 kernel/livepatch/shadow.c | 277 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 kernel/livepatch/shadow.c

(limited to 'kernel')

diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index 2b8bdb1925da..b36ceda6488e 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_LIVEPATCH) += livepatch.o
 
-livepatch-objs := core.o patch.o transition.o
+livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
new file mode 100644
index 000000000000..67e4360521f3
--- /dev/null
+++ b/kernel/livepatch/shadow.c
@@ -0,0 +1,277 @@
+/*
+ * shadow.c - Shadow Variables
+ *
+ * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: Shadow variable API concurrency notes:
+ *
+ * The shadow variable API provides a simple relationship between an
+ * <obj, id> pair and a pointer value.  It is the responsibility of the
+ * caller to provide any mutual exclusion required of the shadow data.
+ *
+ * Once a shadow variable is attached to its parent object via the
+ * klp_shadow_*alloc() API calls, it is considered live: any subsequent
+ * call to klp_shadow_get() may then return the shadow variable's data
+ * pointer.  Callers of klp_shadow_*alloc() should prepare shadow data
+ * accordingly.
+ *
+ * The klp_shadow_*alloc() API calls may allocate memory for new shadow
+ * variable structures.  Their implementation does not call kmalloc
+ * inside any spinlocks, but API callers should pass GFP flags according
+ * to their specific needs.
+ *
+ * The klp_shadow_hash is an RCU-enabled hashtable and is safe against
+ * concurrent klp_shadow_free() and klp_shadow_get() operations.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/livepatch.h>
+
+static DEFINE_HASHTABLE(klp_shadow_hash, 12);
+
+/*
+ * klp_shadow_lock provides exclusive access to the klp_shadow_hash and
+ * the shadow variables it references.
+ */
+static DEFINE_SPINLOCK(klp_shadow_lock);
+
+/**
+ * struct klp_shadow - shadow variable structure
+ * @node:	klp_shadow_hash hash table node
+ * @rcu_head:	RCU is used to safely free this structure
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	data area
+ */
+struct klp_shadow {
+	struct hlist_node node;
+	struct rcu_head rcu_head;
+	void *obj;
+	unsigned long id;
+	char data[];
+};
+
+/**
+ * klp_shadow_match() - verify a shadow variable matches given <obj, id>
+ * @shadow:	shadow variable to match
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * Return: true if the shadow variable matches.
+ */
+static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj,
+				unsigned long id)
+{
+	return shadow->obj == obj && shadow->id == id;
+}
+
+/**
+ * klp_shadow_get() - retrieve a shadow variable data pointer
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get(void *obj, unsigned long id)
+{
+	struct klp_shadow *shadow;
+
+	rcu_read_lock();
+
+	hash_for_each_possible_rcu(klp_shadow_hash, shadow, node,
+				   (unsigned long)obj) {
+
+		if (klp_shadow_match(shadow, obj, id)) {
+			rcu_read_unlock();
+			return shadow->data;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get);
+
+void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+		       size_t size, gfp_t gfp_flags, bool warn_on_exist)
+{
+	struct klp_shadow *new_shadow;
+	void *shadow_data;
+	unsigned long flags;
+
+	/* Check if the shadow variable already exists */
+	shadow_data = klp_shadow_get(obj, id);
+	if (shadow_data)
+		goto exists;
+
+	/* Allocate a new shadow variable for use inside the lock below */
+	new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
+	if (!new_shadow)
+		return NULL;
+
+	new_shadow->obj = obj;
+	new_shadow->id = id;
+
+	/* Initialize the shadow variable if data provided */
+	if (data)
+		memcpy(new_shadow->data, data, size);
+
+	/* Look for <obj, id> again under the lock */
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+	shadow_data = klp_shadow_get(obj, id);
+	if (unlikely(shadow_data)) {
+		/*
+		 * Shadow variable was found, throw away speculative
+		 * allocation.
+		 */
+		spin_unlock_irqrestore(&klp_shadow_lock, flags);
+		kfree(new_shadow);
+		goto exists;
+	}
+
+	/* No <obj, id> found, so attach the newly allocated one */
+	hash_add_rcu(klp_shadow_hash, &new_shadow->node,
+		     (unsigned long)new_shadow->obj);
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+
+	return new_shadow->data;
+
+exists:
+	if (warn_on_exist) {
+		WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id);
+		return NULL;
+	}
+
+	return shadow_data;
+}
+
+/**
+ * klp_shadow_alloc() - allocate and add a new shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	pointer to data to attach to parent
+ * @size:	size of attached data
+ * @gfp_flags:	GFP mask for allocation
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags
+ * and copies @size bytes from @data into the new shadow variable's own
+ * data space.  If @data is NULL, @size bytes are still allocated, but
+ * no copy is performed.  The new shadow variable is then added to the
+ * global hashtable.
+ *
+ * If an existing <obj, id> shadow variable can be found, this routine
+ * will issue a WARN, exit early and return NULL.
+ *
+ * Return: the shadow variable data element, NULL on duplicate or
+ * failure.
+ */
+void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
+		       size_t size, gfp_t gfp_flags)
+{
+	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_alloc);
+
+/**
+ * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	pointer to data to attach to parent
+ * @size:	size of attached data
+ * @gfp_flags:	GFP mask for allocation
+ *
+ * Returns a pointer to existing shadow data if an <obj, id> shadow
+ * variable is already present.  Otherwise, it creates a new shadow
+ * variable like klp_shadow_alloc().
+ *
+ * This function guarantees that only one shadow variable exists with
+ * the given @id for the given @obj.  It also guarantees that the shadow
+ * variable will be initialized by the given @data only when it did not
+ * exist before.
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+			       size_t size, gfp_t gfp_flags)
+{
+	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
+
+/**
+ * klp_shadow_free() - detach and free a <obj, id> shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * This function releases the memory for this <obj, id> shadow variable
+ * instance, callers should stop referencing it accordingly.
+ */
+void klp_shadow_free(void *obj, unsigned long id)
+{
+	struct klp_shadow *shadow;
+	unsigned long flags;
+
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+
+	/* Delete <obj, id> from hash */
+	hash_for_each_possible(klp_shadow_hash, shadow, node,
+			       (unsigned long)obj) {
+
+		if (klp_shadow_match(shadow, obj, id)) {
+			hash_del_rcu(&shadow->node);
+			kfree_rcu(shadow, rcu_head);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free);
+
+/**
+ * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * @id:		data identifier
+ *
+ * This function releases the memory for all <*, id> shadow variable
+ * instances, callers should stop referencing them accordingly.
+ */
+void klp_shadow_free_all(unsigned long id)
+{
+	struct klp_shadow *shadow;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+
+	/* Delete all <*, id> from hash */
+	hash_for_each(klp_shadow_hash, i, shadow, node) {
+		if (klp_shadow_match(shadow, shadow->obj, id)) {
+			hash_del_rcu(&shadow->node);
+			kfree_rcu(shadow, rcu_head);
+		}
+	}
+
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free_all);
-- 
cgit v1.3-14-g43fede


From 5d9da759f7587c87252ef98e70bc0b4a89e4d036 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 14 Sep 2017 14:15:36 -0700
Subject: livepatch: __klp_shadow_get_or_alloc() is local to shadow.c

... therefore make it static.

Fixes: 439e7271dc2 ("livepatch: introduce shadow variable API")
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/shadow.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
index 67e4360521f3..fdac27588d60 100644
--- a/kernel/livepatch/shadow.c
+++ b/kernel/livepatch/shadow.c
@@ -113,7 +113,7 @@ void *klp_shadow_get(void *obj, unsigned long id)
 }
 EXPORT_SYMBOL_GPL(klp_shadow_get);
 
-void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
 		       size_t size, gfp_t gfp_flags, bool warn_on_exist)
 {
 	struct klp_shadow *new_shadow;
-- 
cgit v1.3-14-g43fede


From e454cf5958538666635488030046b6a84a22d447 Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Mon, 18 Sep 2017 15:30:55 -0400
Subject: bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE

This is a simple non-recursive delete operation.  It prunes paths
of empty nodes in the tree, but it does not try to further compress
the tree as nodes are removed.

Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/lpm_trie.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 77 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..9d58a576b2ae 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,84 @@ out:
 	return ret;
 }
 
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
 {
-	/* TODO */
-	return -ENOSYS;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key;
+	struct lpm_trie_node __rcu **trim;
+	struct lpm_trie_node *node;
+	unsigned long irq_flags;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/* Walk the tree looking for an exact key/length match and keeping
+	 * track of where we could begin trimming the tree.  The trim-point
+	 * is the sub-tree along the walk consisting of only single-child
+	 * intermediate nodes and ending at a leaf node that we want to
+	 * remove.
+	 */
+	trim = &trie->root;
+	node = rcu_dereference_protected(
+		trie->root, lockdep_is_held(&trie->lock));
+	while (node) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		/* If we hit a node that has more than one child or is a valid
+		 * prefix itself, do not remove it. Reset the root of the trim
+		 * path to its descendant on our path.
+		 */
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
+		    (node->child[0] && node->child[1]))
+			trim = &node->child[next_bit];
+		node = rcu_dereference_protected(
+			node->child[next_bit], lockdep_is_held(&trie->lock));
+	}
+
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	trie->n_entries--;
+
+	/* If the node we are removing is not a leaf node, simply mark it
+	 * as intermediate and we are done.
+	 */
+	if (rcu_access_pointer(node->child[0]) ||
+	    rcu_access_pointer(node->child[1])) {
+		node->flags |= LPM_TREE_NODE_FLAG_IM;
+		goto out;
+	}
+
+	/* trim should now point to the slot holding the start of a path from
+	 * zero or more intermediate nodes to our leaf node for deletion.
+	 */
+	while ((node = rcu_dereference_protected(
+			*trim, lockdep_is_held(&trie->lock)))) {
+		RCU_INIT_POINTER(*trim, NULL);
+		trim = rcu_access_pointer(node->child[0]) ?
+			&node->child[0] :
+			&node->child[1];
+		kfree_rcu(node, rcu);
+	}
+
+out:
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
 }
 
 #define LPM_DATA_SIZE_MAX	256
-- 
cgit v1.3-14-g43fede


From f454322efbf6faee695f517c6b52c4dc03cacd3e Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 22 Aug 2017 02:16:11 +0300
Subject: signal: replace sigset_to_compat() with put_compat_sigset()

There are 4 callers of sigset_to_compat() in the entire kernel.  One is
in sparc compat rt_sigaction(2), the rest are in kernel/signal.c itself.
All are followed by copy_to_user(), and all but the sparc one are under
"if it's big-endian..." ifdefs.

Let's transform sigset_to_compat() into put_compat_sigset() that also
calls copy_to_user().

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/sparc/kernel/sys_sparc32.c |  6 +++---
 include/linux/compat.h          |  3 ++-
 kernel/compat.c                 | 20 ++++++++++++++------
 kernel/signal.c                 | 27 ++++++---------------------
 4 files changed, 25 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index bca44f3e6b86..5e2bec9e41b2 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -159,7 +159,6 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
 {
         struct k_sigaction new_ka, old_ka;
         int ret;
-	compat_sigset_t set32;
 
         /* XXX: Don't preclude handling different sized sigset_t's.  */
         if (sigsetsize != sizeof(compat_sigset_t))
@@ -167,6 +166,7 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
 
         if (act) {
 		u32 u_handler, u_restorer;
+		compat_sigset_t set32;
 
 		new_ka.ka_restorer = restorer;
 		ret = get_user(u_handler, &act->sa_handler);
@@ -183,9 +183,9 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 
 	if (!ret && oact) {
-		sigset_to_compat(&set32, &old_ka.sa.sa_mask);
 		ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler);
-		ret |= copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t));
+		ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
+					 sizeof(oact->sa_mask));
 		ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
 		ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer);
 		if (ret)
diff --git a/include/linux/compat.h b/include/linux/compat.h
index a5619de3437d..ab1baa79abe8 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -456,7 +456,8 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
 extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
-extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set);
+extern int put_compat_sigset(compat_sigset_t __user *compat,
+			     const sigset_t *set, unsigned int size);
 
 asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
 		compat_ulong_t maxnode, const compat_ulong_t __user *old_nodes,
diff --git a/kernel/compat.c b/kernel/compat.c
index 772e038d04d9..18dd902c9052 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -497,15 +497,23 @@ sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
 }
 EXPORT_SYMBOL_GPL(sigset_from_compat);
 
-void
-sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+		  unsigned int size)
 {
+	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+	compat_sigset_t v;
 	switch (_NSIG_WORDS) {
-	case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
-	case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
-	case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
-	case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
 	}
+	return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+	return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
 }
 
 #ifdef CONFIG_NUMA
diff --git a/kernel/signal.c b/kernel/signal.c
index 800a18f77732..14ad6bb90dad 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2621,13 +2621,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 		if (error)
 			return error;
 	}
-	if (oset) {
-		compat_sigset_t old32;
-		sigset_to_compat(&old32, &old_set);
-		if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
-			return -EFAULT;
-	}
-	return 0;
+	return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
 #else
 	return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
 				  (sigset_t __user *)oset, sigsetsize);
@@ -2669,20 +2663,11 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
 COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
 		compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t set;
 	int err = do_sigpending(&set, sigsetsize);
-	if (!err) {
-		compat_sigset_t set32;
-		sigset_to_compat(&set32, &set);
-		/* we can get here only if sigsetsize <= sizeof(set) */
-		if (copy_to_user(uset, &set32, sigsetsize))
-			err = -EFAULT;
-	}
+	if (!err)
+		err = put_compat_sigset(uset, &set, sigsetsize);
 	return err;
-#else
-	return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
-#endif
 }
 #endif
 
@@ -3451,7 +3436,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 		compat_size_t, sigsetsize)
 {
 	struct k_sigaction new_ka, old_ka;
-	compat_sigset_t mask;
 #ifdef __ARCH_HAS_SA_RESTORER
 	compat_uptr_t restorer;
 #endif
@@ -3463,6 +3447,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 
 	if (act) {
 		compat_uptr_t handler;
+		compat_sigset_t mask;
 		ret = get_user(handler, &act->sa_handler);
 		new_ka.sa.sa_handler = compat_ptr(handler);
 #ifdef __ARCH_HAS_SA_RESTORER
@@ -3478,10 +3463,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 	if (!ret && oact) {
-		sigset_to_compat(&mask, &old_ka.sa.sa_mask);
 		ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 
 			       &oact->sa_handler);
-		ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+		ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
+					 sizeof(oact->sa_mask));
 		ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
 #ifdef __ARCH_HAS_SA_RESTORER
 		ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
-- 
cgit v1.3-14-g43fede


From 1681634b8c70353d8ca211b9b3443889a16dafeb Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 22 Aug 2017 02:16:29 +0300
Subject: signal: simplify compat_sigpending()

Remove "if it's big-endian..." ifdef in compat_sigpending(),
use the endian-agnostic variant.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/signal.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 14ad6bb90dad..f59c05fc374a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3330,15 +3330,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t set;
 	int err = do_sigpending(&set, sizeof(set.sig[0]));
 	if (!err)
 		err = put_user(set.sig[0], set32);
 	return err;
-#else
-	return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
-#endif
 }
 #endif
 
-- 
cgit v1.3-14-g43fede


From 176826af03666758c656dd27f098cc889b71638b Mon Sep 17 00:00:00 2001
From: "Dmitry V. Levin" <ldv@altlinux.org>
Date: Tue, 22 Aug 2017 02:16:43 +0300
Subject: signal: lift sigset size check out of do_sigpending()

As sigsetsize argument of do_sigpending() is not used anywhere else in
that function after the check, remove this argument and move the check
out of do_sigpending() into rt_sigpending() and its compat analog.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/signal.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f59c05fc374a..9fbc574ced10 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2629,11 +2629,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 }
 #endif
 
-static int do_sigpending(void *set, unsigned long sigsetsize)
+static int do_sigpending(sigset_t *set)
 {
-	if (sigsetsize > sizeof(sigset_t))
-		return -EINVAL;
-
 	spin_lock_irq(&current->sighand->siglock);
 	sigorsets(set, &current->pending.signal,
 		  &current->signal->shared_pending.signal);
@@ -2653,7 +2650,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
 SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
 {
 	sigset_t set;
-	int err = do_sigpending(&set, sigsetsize);
+	int err;
+
+	if (sigsetsize > sizeof(*uset))
+		return -EINVAL;
+
+	err = do_sigpending(&set);
 	if (!err && copy_to_user(uset, &set, sigsetsize))
 		err = -EFAULT;
 	return err;
@@ -2664,7 +2666,12 @@ COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
 		compat_size_t, sigsetsize)
 {
 	sigset_t set;
-	int err = do_sigpending(&set, sigsetsize);
+	int err;
+
+	if (sigsetsize > sizeof(*uset))
+		return -EINVAL;
+
+	err = do_sigpending(&set);
 	if (!err)
 		err = put_compat_sigset(uset, &set, sigsetsize);
 	return err;
@@ -3331,7 +3338,7 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
 COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
 {
 	sigset_t set;
-	int err = do_sigpending(&set, sizeof(set.sig[0]));
+	int err = do_sigpending(&set);
 	if (!err)
 		err = put_user(set.sig[0], set32);
 	return err;
-- 
cgit v1.3-14-g43fede


From b8e8e1aa9f14110da180569908bbe538c9e9dc63 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 3 Sep 2017 20:42:54 -0400
Subject: get rid of {get,put}_compat_itimerspec()

no users left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/compat.h |  5 -----
 kernel/compat.c        | 18 ------------------
 2 files changed, 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index ab1baa79abe8..21d30be5c0a5 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -443,11 +443,6 @@ static inline int compat_timespec_compare(struct compat_timespec *lhs,
 	return lhs->tv_nsec - rhs->tv_nsec;
 }
 
-extern int get_compat_itimerspec(struct itimerspec *dst,
-				 const struct compat_itimerspec __user *src);
-extern int put_compat_itimerspec(struct compat_itimerspec __user *dst,
-				 const struct itimerspec *src);
-
 asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
 		struct timezone __user *tz);
 asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
diff --git a/kernel/compat.c b/kernel/compat.c
index 18dd902c9052..d43b18031116 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t,  pid, unsigned int, len,
 	return ret;
 }
 
-int get_compat_itimerspec(struct itimerspec *dst,
-			  const struct compat_itimerspec __user *src)
-{
-	if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
-	    __compat_get_timespec(&dst->it_value, &src->it_value))
-		return -EFAULT;
-	return 0;
-}
-
-int put_compat_itimerspec(struct compat_itimerspec __user *dst,
-			  const struct itimerspec *src)
-{
-	if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
-	    __compat_put_timespec(&src->it_value, &dst->it_value))
-		return -EFAULT;
-	return 0;
-}
-
 int get_compat_itimerspec64(struct itimerspec64 *its,
 			const struct compat_itimerspec __user *uits)
 {
-- 
cgit v1.3-14-g43fede


From 3968cf623892d710e651070243fd16af312a9797 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 3 Sep 2017 21:45:17 -0400
Subject: get_compat_sigset()

similar to put_compat_sigset()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/sparc/kernel/sys_sparc32.c |  4 +---
 fs/eventpoll.c                  |  4 +---
 fs/select.c                     |  8 ++------
 fs/signalfd.c                   |  4 +---
 include/linux/compat.h          |  2 +-
 kernel/compat.c                 | 23 ++++++++++++++++-------
 kernel/signal.c                 | 27 ++++-----------------------
 virt/kvm/kvm_main.c             |  7 ++-----
 8 files changed, 28 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index 5e2bec9e41b2..34ece61ee970 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -166,13 +166,11 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig,
 
         if (act) {
 		u32 u_handler, u_restorer;
-		compat_sigset_t set32;
 
 		new_ka.ka_restorer = restorer;
 		ret = get_user(u_handler, &act->sa_handler);
 		new_ka.sa.sa_handler =  compat_ptr(u_handler);
-		ret |= copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t));
-		sigset_from_compat(&new_ka.sa.sa_mask, &set32);
+		ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
 		ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
 		ret |= get_user(u_restorer, &act->sa_restorer);
 		new_ka.sa.sa_restorer = compat_ptr(u_restorer);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..396a3c075fd4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2259,7 +2259,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 			compat_size_t, sigsetsize)
 {
 	long err;
-	compat_sigset_t csigmask;
 	sigset_t ksigmask, sigsaved;
 
 	/*
@@ -2269,9 +2268,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&csigmask, sigmask, sizeof(csigmask)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &csigmask);
 		sigsaved = current->blocked;
 		set_current_blocked(&ksigmask);
 	}
diff --git a/fs/select.c b/fs/select.c
index 20a7d061904f..9c980162c9fe 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -1301,7 +1301,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
 	compat_size_t sigsetsize)
 {
-	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
@@ -1318,9 +1317,8 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
 
 		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
@@ -1369,7 +1367,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
 	unsigned int,  nfds, struct compat_timespec __user *, tsp,
 	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
 {
-	compat_sigset_t ss32;
 	sigset_t ksigmask, sigsaved;
 	struct timespec64 ts, end_time, *to = NULL;
 	int ret;
@@ -1386,9 +1383,8 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
 	if (sigmask) {
 		if (sigsetsize != sizeof(compat_sigset_t))
 			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+		if (get_compat_sigset(&ksigmask, sigmask))
 			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
 
 		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
 		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d2c434112f42..9de5beeb771d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -312,15 +312,13 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd,
 		     compat_size_t, sigsetsize,
 		     int, flags)
 {
-	compat_sigset_t ss32;
 	sigset_t tmp;
 	sigset_t __user *ksigmask;
 
 	if (sigsetsize != sizeof(compat_sigset_t))
 		return -EINVAL;
-	if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+	if (get_compat_sigset(&tmp, sigmask))
 		return -EFAULT;
-	sigset_from_compat(&tmp, &ss32);
 	ksigmask = compat_alloc_user_space(sizeof(sigset_t));
 	if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t)))
 		return -EFAULT;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 21d30be5c0a5..57cb6ecafa86 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -450,7 +450,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
-extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
+extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat);
 extern int put_compat_sigset(compat_sigset_t __user *compat,
 			     const sigset_t *set, unsigned int size);
 
diff --git a/kernel/compat.c b/kernel/compat.c
index d43b18031116..a46a4a40bb8b 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -467,17 +467,26 @@ Efault:
 	return -EFAULT;
 }
 
-void
-sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
+int
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
 {
+#ifdef __BIG_ENDIAN
+	compat_sigset_t v;
+	if (copy_from_user(&v, compat, sizeof(compat_sigset_t)))
+		return -EFAULT;
 	switch (_NSIG_WORDS) {
-	case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
-	case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
-	case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
-	case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+	case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+	case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+	case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+	case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
 	}
+#else
+	if (copy_from_user(set, compat, sizeof(compat_sigset_t)))
+		return -EFAULT;
+#endif
+	return 0;
 }
-EXPORT_SYMBOL_GPL(sigset_from_compat);
+EXPORT_SYMBOL_GPL(get_compat_sigset);
 
 int
 put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
diff --git a/kernel/signal.c b/kernel/signal.c
index 9fbc574ced10..36a523640894 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2600,7 +2600,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
 COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 		compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t old_set = current->blocked;
 
 	/* XXX: Don't preclude handling different sized sigset_t's.  */
@@ -2608,13 +2607,10 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 		return -EINVAL;
 
 	if (nset) {
-		compat_sigset_t new32;
 		sigset_t new_set;
 		int error;
-		if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+		if (get_compat_sigset(&new_set, nset))
 			return -EFAULT;
-
-		sigset_from_compat(&new_set, &new32);
 		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
 
 		error = sigprocmask(how, &new_set, NULL);
@@ -2622,10 +2618,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 			return error;
 	}
 	return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
-#else
-	return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
-				  (sigset_t __user *)oset, sigsetsize);
-#endif
 }
 #endif
 
@@ -2908,7 +2900,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 		struct compat_siginfo __user *, uinfo,
 		struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
 {
-	compat_sigset_t s32;
 	sigset_t s;
 	struct timespec t;
 	siginfo_t info;
@@ -2917,9 +2908,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
 
-	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+	if (get_compat_sigset(&s, uthese))
 		return -EFAULT;
-	sigset_from_compat(&s, &s32);
 
 	if (uts) {
 		if (compat_get_timespec(&t, uts))
@@ -3450,18 +3440,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 
 	if (act) {
 		compat_uptr_t handler;
-		compat_sigset_t mask;
 		ret = get_user(handler, &act->sa_handler);
 		new_ka.sa.sa_handler = compat_ptr(handler);
 #ifdef __ARCH_HAS_SA_RESTORER
 		ret |= get_user(restorer, &act->sa_restorer);
 		new_ka.sa.sa_restorer = compat_ptr(restorer);
 #endif
-		ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+		ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
 		ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
 		if (ret)
 			return -EFAULT;
-		sigset_from_compat(&new_ka.sa.sa_mask, &mask);
 	}
 
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
@@ -3649,22 +3637,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t newset;
-	compat_sigset_t newset32;
 
 	/* XXX: Don't preclude handling different sized sigset_t's.  */
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
 
-	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+	if (get_compat_sigset(&newset, unewset))
 		return -EFAULT;
-	sigset_from_compat(&newset, &newset32);
 	return sigsuspend(&newset);
-#else
-	/* on little-endian bitmaps don't care about granularity */
-	return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
-#endif
 }
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9deb5a245b83..99bfe50a0589 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2724,7 +2724,6 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
 	case KVM_SET_SIGNAL_MASK: {
 		struct kvm_signal_mask __user *sigmask_arg = argp;
 		struct kvm_signal_mask kvm_sigmask;
-		compat_sigset_t csigset;
 		sigset_t sigset;
 
 		if (argp) {
@@ -2733,13 +2732,11 @@ static long kvm_vcpu_compat_ioctl(struct file *filp,
 					   sizeof(kvm_sigmask)))
 				goto out;
 			r = -EINVAL;
-			if (kvm_sigmask.len != sizeof(csigset))
+			if (kvm_sigmask.len != sizeof(compat_sigset_t))
 				goto out;
 			r = -EFAULT;
-			if (copy_from_user(&csigset, sigmask_arg->sigset,
-					   sizeof(csigset)))
+			if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset))
 				goto out;
-			sigset_from_compat(&sigset, &csigset);
 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
 		} else
 			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
-- 
cgit v1.3-14-g43fede


From abca5fc535a3ee0f36fb6d4468a453eaae769921 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 19 Sep 2017 18:17:46 -0400
Subject: sched_rr_get_interval(): move compat to native, get rid of set_fs()

switch to using timespec64 internally, while we are at it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/compat.c     | 16 ----------------
 kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------
 2 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/compat.c b/kernel/compat.c
index a46a4a40bb8b..d1cee656a7ed 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -562,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
 }
 #endif
 
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
-		       compat_pid_t, pid,
-		       struct compat_timespec __user *, interval)
-{
-	struct timespec t;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
-	set_fs(old_fs);
-	if (compat_put_timespec(&t, interval))
-		return -EFAULT;
-	return ret;
-}
-
 /*
  * Allocate user-space memory for the duration of a single system call,
  * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..e74f0a5a8647 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
 #include <linux/init_task.h>
 #include <linux/context_tracking.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/compat.h>
 
 #include <linux/blkdev.h>
 #include <linux/kprobes.h>
@@ -5098,13 +5099,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
  * an error code.
  */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
-		struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
 {
 	struct task_struct *p;
 	unsigned int time_slice;
 	struct rq_flags rf;
-	struct timespec t;
 	struct rq *rq;
 	int retval;
 
@@ -5128,15 +5127,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 	task_rq_unlock(rq, p, &rf);
 
 	rcu_read_unlock();
-	jiffies_to_timespec(time_slice, &t);
-	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
-	return retval;
+	jiffies_to_timespec64(time_slice, t);
+	return 0;
 
 out_unlock:
 	rcu_read_unlock();
 	return retval;
 }
 
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct timespec64 t;
+	int retval = sched_rr_get_interval(pid, &t);
+
+	if (retval == 0)
+		retval = put_timespec64(&t, interval);
+
+	return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+		       compat_pid_t, pid,
+		       struct compat_timespec __user *, interval)
+{
+	struct timespec64 t;
+	int retval = sched_rr_get_interval(pid, &t);
+
+	if (retval == 0)
+		retval = compat_put_timespec64(&t, interval);
+	return retval;
+}
+#endif
+
 void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
-- 
cgit v1.3-14-g43fede


From cfb766da54d98ceb145e1bb0bd11c559569dcbfc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 08:12:04 -0700
Subject: sched/cputime: Expose cputime_adjust()

Will be used by basic cgroup resource stat reporting later.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/sched/cputime.h | 3 ++-
 kernel/sched/cputime.c        | 5 ++---
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 4c5b9735c1ae..9251044335c5 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -53,7 +53,8 @@ static inline void task_cputime_scaled(struct task_struct *t,
 
 extern void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st);
-
+extern void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+			   u64 *ut, u64 *st);
 
 /*
  * Thread group CPU time accounting.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf97c53..8839b6e8a104 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -585,9 +585,8 @@ drop_precision:
  *
  * Assuming that rtime_i+1 >= rtime_i.
  */
-static void cputime_adjust(struct task_cputime *curr,
-			   struct prev_cputime *prev,
-			   u64 *ut, u64 *st)
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+		    u64 *ut, u64 *st)
 {
 	u64 rtime, stime, utime;
 	unsigned long flags;
-- 
cgit v1.3-14-g43fede


From d2cc5ed6949085cfba30ec5228816cf6eb1d02b9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 08:12:04 -0700
Subject: cpuacct: Introduce cgroup_account_cputime[_field]()

Introduce cgroup_account_cputime[_field]() which wrap cpuacct_charge()
and cgroup_account_field().  This doesn't introduce any functional
changes and will be used to add cgroup basic resource accounting.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
---
 include/linux/cgroup.h   | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/sched/cpuacct.h   | 17 -----------------
 kernel/sched/cputime.c   |  2 +-
 kernel/sched/deadline.c  |  2 +-
 kernel/sched/fair.c      |  2 +-
 kernel/sched/rt.c        |  2 +-
 kernel/sched/sched.h     |  2 +-
 kernel/sched/stop_task.c |  2 +-
 8 files changed, 44 insertions(+), 23 deletions(-)
 delete mode 100644 kernel/sched/cpuacct.h

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d023ac5e377f..6cd579329310 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,7 @@
 #include <linux/nsproxy.h>
 #include <linux/user_namespace.h>
 #include <linux/refcount.h>
+#include <linux/kernel_stat.h>
 
 #include <linux/cgroup-defs.h>
 
@@ -688,6 +689,43 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
 	char *buf, size_t buflen) {}
 #endif /* !CONFIG_CGROUPS */
 
+/*
+ * Basic resource stats.
+ */
+#ifdef CONFIG_CGROUPS
+
+#ifdef CONFIG_CGROUP_CPUACCT
+void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_account_field(struct task_struct *tsk, int index,
+					 u64 val) {}
+#endif
+
+static inline void cgroup_account_cputime(struct task_struct *task,
+					  u64 delta_exec)
+{
+	cpuacct_charge(task, delta_exec);
+}
+
+static inline void cgroup_account_cputime_field(struct task_struct *task,
+						enum cpu_usage_stat index,
+						u64 delta_exec)
+{
+	cpuacct_account_field(task, index, delta_exec);
+}
+
+#else	/* CONFIG_CGROUPS */
+
+static inline void cgroup_account_cputime(struct task_struct *task,
+					  u64 delta_exec) {}
+static inline void cgroup_account_cputime_field(struct task_struct *task,
+						enum cpu_usage_stat index,
+						u64 delta_exec) {}
+
+#endif	/* CONFIG_CGROUPS */
+
 /*
  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
  * definition in cgroup-defs.h.
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
deleted file mode 100644
index ba72807c73d4..000000000000
--- a/kernel/sched/cpuacct.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifdef CONFIG_CGROUP_CPUACCT
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
-
-#else
-
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-}
-
-static inline void
-cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
-{
-}
-
-#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8839b6e8a104..e01b699bbd5b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
 	 */
 	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
-	cpuacct_account_field(p, index, tmp);
+	cgroup_account_cputime_field(p, index, tmp);
 }
 
 /*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0191ec7667c3..abd913c1b99e 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1143,7 +1143,7 @@ static void update_curr_dl(struct rq *rq)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..0ae69af95b8b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -851,7 +851,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		struct task_struct *curtask = task_of(curr);
 
 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
-		cpuacct_charge(curtask, delta_exec);
+		cgroup_account_cputime(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0af5ca9e3e3f..fdc2c5d1f82e 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -979,7 +979,7 @@ static void update_curr_rt(struct rq *rq)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14db76cd496f..f0b98f978843 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -29,6 +29,7 @@
 #include <linux/irq_work.h>
 #include <linux/tick.h>
 #include <linux/slab.h>
+#include <linux/cgroup.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
@@ -36,7 +37,6 @@
 
 #include "cpupri.h"
 #include "cpudeadline.h"
-#include "cpuacct.h"
 
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 9f69fb630853..ec0bb5ab9024 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -71,7 +71,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 }
 
 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
-- 
cgit v1.3-14-g43fede


From 041cd640b2f3c5607171c59d8712b503659d21f7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 08:12:05 -0700
Subject: cgroup: Implement cgroup2 basic CPU usage accounting

In cgroup1, while cpuacct isn't actually controlling any resources, it
is a separate controller due to combination of two factors -
1. enabling cpu controller has significant side effects, and 2. we
have to pick one of the hierarchies to account CPU usages on.  cpuacct
controller is effectively used to designate a hierarchy to track CPU
usages on.

cgroup2's unified hierarchy removes the second reason and we can
account basic CPU usages by default.  While we can use cpuacct for
this purpose, both its interface and implementation leave a lot to be
desired - it collects and exposes two sources of truth which don't
agree with each other and some of the exposed statistics don't make
much sense.  Also, it propagates all the way up the hierarchy on each
accounting event which is unnecessary.

This patch adds basic resource accounting mechanism to cgroup2's
unified hierarchy and accounts CPU usages using it.

* All accountings are done per-cpu and don't propagate immediately.
  It just bumps the per-cgroup per-cpu counters and links to the
  parent's updated list if not already on it.

* On a read, the per-cpu counters are collected into the global ones
  and then propagated upwards.  Only the per-cpu counters which have
  changed since the last read are propagated.

* CPU usage stats are collected and shown in "cgroup.stat" with "cpu."
  prefix.  Total usage is collected from scheduling events.  User/sys
  breakdown is sourced from tick sampling and adjusted to the usage
  using cputime_adjust().

This keeps the accounting side hot path O(1) and per-cpu and the read
side O(nr_updated_since_last_read).

v2: Minor changes and documentation updates as suggested by Waiman and
    Roman.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Roman Gushchin <guro@fb.com>
---
 Documentation/cgroup-v2.txt     |   9 ++
 include/linux/cgroup-defs.h     |  57 +++++++
 include/linux/cgroup.h          |  22 +++
 kernel/cgroup/Makefile          |   2 +-
 kernel/cgroup/cgroup-internal.h |   8 +
 kernel/cgroup/cgroup.c          |  24 ++-
 kernel/cgroup/stat.c            | 334 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 453 insertions(+), 3 deletions(-)
 create mode 100644 kernel/cgroup/stat.c

(limited to 'kernel')

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index dc44785dc0fa..3f8216912df0 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -886,6 +886,15 @@ All cgroup core files are prefixed with "cgroup."
 		A dying cgroup can consume system resources not exceeding
 		limits, which were active at the moment of cgroup deletion.
 
+	  cpu.usage_usec
+		CPU time consumed in the subtree.
+
+	  cpu.user_usec
+		User CPU time consumed in the subtree.
+
+	  cpu.system_usec
+		System CPU time consumed in the subtree.
+
 
 Controllers
 ===========
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index ade4a78a54c2..3e55bbd31ad1 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup.h>
 
@@ -254,6 +255,57 @@ struct css_set {
 	struct rcu_head rcu_head;
 };
 
+/*
+ * cgroup basic resource usage statistics.  Accounting is done per-cpu in
+ * cgroup_cpu_stat which is then lazily propagated up the hierarchy on
+ * reads.
+ *
+ * When a stat gets updated, the cgroup_cpu_stat and its ancestors are
+ * linked into the updated tree.  On the following read, propagation only
+ * considers and consumes the updated tree.  This makes reading O(the
+ * number of descendants which have been active since last read) instead of
+ * O(the total number of descendants).
+ *
+ * This is important because there can be a lot of (draining) cgroups which
+ * aren't active and stat may be read frequently.  The combination can
+ * become very expensive.  By propagating selectively, increasing reading
+ * frequency decreases the cost of each read.
+ */
+struct cgroup_cpu_stat {
+	/*
+	 * ->sync protects all the current counters.  These are the only
+	 * fields which get updated in the hot path.
+	 */
+	struct u64_stats_sync sync;
+	struct task_cputime cputime;
+
+	/*
+	 * Snapshots at the last reading.  These are used to calculate the
+	 * deltas to propagate to the global counters.
+	 */
+	struct task_cputime last_cputime;
+
+	/*
+	 * Child cgroups with stat updates on this cpu since the last read
+	 * are linked on the parent's ->updated_children through
+	 * ->updated_next.
+	 *
+	 * In addition to being more compact, singly-linked list pointing
+	 * to the cgroup makes it unnecessary for each per-cpu struct to
+	 * point back to the associated cgroup.
+	 *
+	 * Protected by per-cpu cgroup_cpu_stat_lock.
+	 */
+	struct cgroup *updated_children;	/* terminated by self cgroup */
+	struct cgroup *updated_next;		/* NULL iff not on the list */
+};
+
+struct cgroup_stat {
+	/* per-cpu statistics are collected into the folowing global counters */
+	struct task_cputime cputime;
+	struct prev_cputime prev_cputime;
+};
+
 struct cgroup {
 	/* self css with NULL ->ss, points back to this cgroup */
 	struct cgroup_subsys_state self;
@@ -353,6 +405,11 @@ struct cgroup {
 	 */
 	struct cgroup *dom_cgrp;
 
+	/* cgroup basic resource statistics */
+	struct cgroup_cpu_stat __percpu *cpu_stat;
+	struct cgroup_stat pending_stat;	/* pending from children */
+	struct cgroup_stat stat;
+
 	/*
 	 * list of pidlists, up to two for each namespace (one for procs, one
 	 * for tasks); created on demand.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6cd579329310..328a70ce0e23 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -703,17 +703,39 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index,
 					 u64 val) {}
 #endif
 
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix);
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec);
+
 static inline void cgroup_account_cputime(struct task_struct *task,
 					  u64 delta_exec)
 {
+	struct cgroup *cgrp;
+
 	cpuacct_charge(task, delta_exec);
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	if (cgroup_parent(cgrp))
+		__cgroup_account_cputime(cgrp, delta_exec);
+	rcu_read_unlock();
 }
 
 static inline void cgroup_account_cputime_field(struct task_struct *task,
 						enum cpu_usage_stat index,
 						u64 delta_exec)
 {
+	struct cgroup *cgrp;
+
 	cpuacct_account_field(task, index, delta_exec);
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(task);
+	if (cgroup_parent(cgrp))
+		__cgroup_account_cputime_field(cgrp, index, delta_exec);
+	rcu_read_unlock();
 }
 
 #else	/* CONFIG_CGROUPS */
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ce693ccb8c58..0acee616e06c 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,4 +1,4 @@
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 5151ff256c29..fa642c99586a 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -199,6 +199,14 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
+/*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_boot(void);
+
 /*
  * namespace.c
  */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..d036625556c9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -3301,6 +3303,8 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
 
+	cgroup_stat_show_cputime(seq, "cpu.");
+
 	return 0;
 }
 
@@ -4471,6 +4475,8 @@ static void css_free_work_fn(struct work_struct *work)
 			 */
 			cgroup_put(cgroup_parent(cgrp));
 			kernfs_put(cgrp->kn);
+			if (cgroup_on_dfl(cgrp))
+				cgroup_stat_exit(cgrp);
 			kfree(cgrp);
 		} else {
 			/*
@@ -4515,6 +4521,9 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		trace_cgroup_release(cgrp);
 
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_flush(cgrp);
+
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
 		     tcgrp = cgroup_parent(tcgrp))
 			tcgrp->nr_dying_descendants--;
@@ -4698,6 +4707,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (ret)
 		goto out_free_cgrp;
 
+	if (cgroup_on_dfl(parent)) {
+		ret = cgroup_stat_init(cgrp);
+		if (ret)
+			goto out_cancel_ref;
+	}
+
 	/*
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
 	 * a half-baked cgroup.
@@ -4705,7 +4720,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		ret = -ENOMEM;
-		goto out_cancel_ref;
+		goto out_stat_exit;
 	}
 
 	init_cgroup_housekeeping(cgrp);
@@ -4754,6 +4769,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 
 	return cgrp;
 
+out_stat_exit:
+	if (cgroup_on_dfl(parent))
+		cgroup_stat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5148,6 +5166,8 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+	cgroup_stat_boot();
+
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
 	 * avoid it at the cost of forcing all readers into the slow path.
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..9cce79e89320
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+	return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated.  Put it on the parent's matching
+ * cpu_stat->updated_children list.  See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+	struct cgroup *parent;
+	unsigned long flags;
+
+	/*
+	 * Speculative already-on-list test.  This may race leading to
+	 * temporary inaccuracies, which is fine.
+	 *
+	 * Because @parent's updated_children is terminated with @parent
+	 * instead of NULL, we can tell whether @cgrp is on the list by
+	 * testing the next pointer for NULL.
+	 */
+	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+		return;
+
+	raw_spin_lock_irqsave(cpu_lock, flags);
+
+	/* put @cgrp and all ancestors on the corresponding updated lists */
+	for (parent = cgroup_parent(cgrp); parent;
+	     cgrp = parent, parent = cgroup_parent(cgrp)) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+		/*
+		 * Both additions and removals are bottom-up.  If a cgroup
+		 * is already in the tree, all ancestors are.
+		 */
+		if (cstat->updated_next)
+			break;
+
+		cstat->updated_next = pcstat->updated_children;
+		pcstat->updated_children = cgrp;
+	}
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated cpu_stat tree on @cpu from @root.  %NULL @pos starts
+ * the traversal and %NULL return indicates the end.  During traversal,
+ * each returned cgroup is unlinked from the tree.  Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+						  struct cgroup *root, int cpu)
+{
+	struct cgroup_cpu_stat *cstat;
+	struct cgroup *parent;
+
+	if (pos == root)
+		return NULL;
+
+	/*
+	 * We're gonna walk down to the first leaf and visit/remove it.  We
+	 * can pick whatever unvisited node as the starting point.
+	 */
+	if (!pos)
+		pos = root;
+	else
+		pos = cgroup_parent(pos);
+
+	/* walk down to the first leaf */
+	while (true) {
+		cstat = cgroup_cpu_stat(pos, cpu);
+		if (cstat->updated_children == pos)
+			break;
+		pos = cstat->updated_children;
+	}
+
+	/*
+	 * Unlink @pos from the tree.  As the updated_children list is
+	 * singly linked, we have to walk it to find the removal point.
+	 * However, due to the way we traverse, @pos will be the first
+	 * child in most cases. The only exception is @root.
+	 */
+	parent = cgroup_parent(pos);
+	if (parent && cstat->updated_next) {
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+		struct cgroup_cpu_stat *ncstat;
+		struct cgroup **nextp;
+
+		nextp = &pcstat->updated_children;
+		while (true) {
+			ncstat = cgroup_cpu_stat(*nextp, cpu);
+			if (*nextp == pos)
+				break;
+
+			WARN_ON_ONCE(*nextp == parent);
+			nextp = &ncstat->updated_next;
+		}
+
+		*nextp = cstat->updated_next;
+		cstat->updated_next = NULL;
+	}
+
+	return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+				   struct cgroup_stat *src_stat)
+{
+	dst_stat->cputime.utime += src_stat->cputime.utime;
+	dst_stat->cputime.stime += src_stat->cputime.stime;
+	dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+	struct task_cputime *last_cputime = &cstat->last_cputime;
+	struct task_cputime cputime;
+	struct cgroup_stat delta;
+	unsigned seq;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	/* fetch the current per-cpu values */
+	do {
+		seq = __u64_stats_fetch_begin(&cstat->sync);
+		cputime = cstat->cputime;
+	} while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+	/* accumulate the deltas to propgate */
+	delta.cputime.utime = cputime.utime - last_cputime->utime;
+	delta.cputime.stime = cputime.stime - last_cputime->stime;
+	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+					 last_cputime->sum_exec_runtime;
+	*last_cputime = cputime;
+
+	/* transfer the pending stat into delta */
+	cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+	memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+	/* propagate delta into the global stat and the parent's pending */
+	cgroup_stat_accumulate(&cgrp->stat, &delta);
+	if (parent)
+		cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+	int cpu;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	for_each_possible_cpu(cpu) {
+		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+		struct cgroup *pos = NULL;
+
+		raw_spin_lock_irq(cpu_lock);
+		while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+			cgroup_cpu_stat_flush_one(pos, cpu);
+		raw_spin_unlock_irq(cpu_lock);
+	}
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards.  After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_stat_mutex);
+	cgroup_stat_flush_locked(cgrp);
+	mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = get_cpu_ptr(cgrp->cpu_stat);
+	u64_stats_update_begin(&cstat->sync);
+	return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+					struct cgroup_cpu_stat *cstat)
+{
+	u64_stats_update_end(&cstat->sync);
+	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+	put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+	cstat->cputime.sum_exec_runtime += delta_exec;
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+	switch (index) {
+	case CPUTIME_USER:
+	case CPUTIME_NICE:
+		cstat->cputime.utime += delta_exec;
+		break;
+	case CPUTIME_SYSTEM:
+	case CPUTIME_IRQ:
+	case CPUTIME_SOFTIRQ:
+		cstat->cputime.stime += delta_exec;
+		break;
+	default:
+		break;
+	}
+
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	u64 usage, utime, stime;
+
+	if (!cgroup_parent(cgrp))
+		return;
+
+	mutex_lock(&cgroup_stat_mutex);
+
+	cgroup_stat_flush_locked(cgrp);
+
+	usage = cgrp->stat.cputime.sum_exec_runtime;
+	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+		       &utime, &stime);
+
+	mutex_unlock(&cgroup_stat_mutex);
+
+	do_div(usage, NSEC_PER_USEC);
+	do_div(utime, NSEC_PER_USEC);
+	do_div(stime, NSEC_PER_USEC);
+
+	seq_printf(seq, "%susage_usec %llu\n"
+		   "%suser_usec %llu\n"
+		   "%ssystem_usec %llu\n",
+		   prefix, usage, prefix, utime, prefix, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+	int cpu;
+
+	/* the root cgrp has cpu_stat preallocated */
+	if (!cgrp->cpu_stat) {
+		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+		if (!cgrp->cpu_stat)
+			return -ENOMEM;
+	}
+
+	/* ->updated_children list is self terminated */
+	for_each_possible_cpu(cpu)
+		cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+	prev_cputime_init(&cgrp->stat.prev_cputime);
+
+	return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+	int cpu;
+
+	cgroup_stat_flush(cgrp);
+
+	/* sanity check */
+	for_each_possible_cpu(cpu) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+		if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+		    WARN_ON_ONCE(cstat->updated_next))
+			return;
+	}
+
+	free_percpu(cgrp->cpu_stat);
+	cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+	BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
-- 
cgit v1.3-14-g43fede


From e0b477941d130576d9daf31160dab6e34335b10a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:04 +0200
Subject: genirq/debugfs: Show debug information for all irq descriptors

Currently the debugfs shows only information about actively used interrupts
like /proc/irq/ does. That's fine for most cases, but not helpful when
internals of allocated, but unused interrupt descriptors have to
debugged. It's also useful to provide information about all descriptors so
leaks can be debugged in a simpler way.

Move the debugfs registration to the descriptor allocation code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213152.355525908@linutronix.de
---
 kernel/irq/irqdesc.c | 1 +
 kernel/irq/manage.c  | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 82afb7ed369f..0f6805b75fc8 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -462,6 +462,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 			goto err;
 		irq_insert_desc(start + i, desc);
 		irq_sysfs_add(start + i, desc);
+		irq_add_debugfs_entry(start + i, desc);
 	}
 	bitmap_set(allocated_irqs, start, cnt);
 	return start;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index d00132b5c325..0e8b48315f3c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1400,7 +1400,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		wake_up_process(new->secondary->thread);
 
 	register_irq_proc(irq, desc);
-	irq_add_debugfs_entry(irq, desc);
 	new->dir = NULL;
 	register_handler_proc(irq, new);
 	return 0;
-- 
cgit v1.3-14-g43fede


From 07557ccb8c83f315e409ddee181e9ffbf87c6ad1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:05 +0200
Subject: genirq/msi: Capture device name for debugfs

For debugging the allocation of unused or potentially leaked interrupt
descriptor it's helpful to have some information about the site which
allocated them. In case of MSI this is simple because the caller hands the
device struct pointer into the domain allocation function.

Duplicate the device name and show it in the debugfs entry of the interrupt
descriptor.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213152.433038426@linutronix.de
---
 include/linux/irqdesc.h |  1 +
 kernel/irq/debugfs.c    | 10 ++++++++++
 kernel/irq/internals.h  |  5 +++++
 kernel/irq/msi.c        |  6 +++++-
 4 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 3e90a094798d..b55b113c049b 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -93,6 +93,7 @@ struct irq_desc {
 #endif
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
 	struct dentry		*debugfs_file;
+	const char		*dev_name;
 #endif
 #ifdef CONFIG_SPARSE_IRQ
 	struct rcu_head		rcu;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index c3fdb36dec30..b7d1023b9b22 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -149,6 +149,7 @@ static int irq_debug_show(struct seq_file *m, void *p)
 	raw_spin_lock_irq(&desc->lock);
 	data = irq_desc_get_irq_data(desc);
 	seq_printf(m, "handler:  %pf\n", desc->handle_irq);
+	seq_printf(m, "device:   %s\n", desc->dev_name);
 	seq_printf(m, "status:   0x%08x\n", desc->status_use_accessors);
 	irq_debug_show_bits(m, 0, desc->status_use_accessors, irqdesc_states,
 			    ARRAY_SIZE(irqdesc_states));
@@ -226,6 +227,15 @@ static const struct file_operations dfs_irq_ops = {
 	.release	= single_release,
 };
 
+void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	const char *name = dev_name(dev);
+
+	if (name)
+		desc->dev_name = kstrdup(name, GFP_KERNEL);
+}
+
 void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc)
 {
 	char name [10];
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a4aa39009f0d..cfaec2669093 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -443,7 +443,9 @@ void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
 static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
 {
 	debugfs_remove(desc->debugfs_file);
+	kfree(desc->dev_name);
 }
+void irq_debugfs_copy_devname(int irq, struct device *dev);
 # ifdef CONFIG_IRQ_DOMAIN
 void irq_domain_debugfs_init(struct dentry *root);
 # else
@@ -458,4 +460,7 @@ static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
 static inline void irq_remove_debugfs_entry(struct irq_desc *d)
 {
 }
+static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
+{
+}
 #endif /* CONFIG_GENERIC_IRQ_DEBUGFS */
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 3fa4bd59f569..94ecc0293844 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -16,6 +16,8 @@
 #include <linux/msi.h>
 #include <linux/slab.h>
 
+#include "internals.h"
+
 /**
  * alloc_msi_entry - Allocate an initialize msi_entry
  * @dev:	Pointer to the device for which this is allocated
@@ -373,8 +375,10 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 			return ret;
 		}
 
-		for (i = 0; i < desc->nvec_used; i++)
+		for (i = 0; i < desc->nvec_used; i++) {
 			irq_set_msi_desc_off(virq, i, desc);
+			irq_debugfs_copy_devname(virq + i, dev);
+		}
 	}
 
 	if (ops->msi_finish)
-- 
cgit v1.3-14-g43fede


From c3e7239a7f43ce1ff407df5f5944bf0d42dc21bf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:06 +0200
Subject: irqdomain/debugfs: Provide domain specific debug callback

Some interrupt domains like the X86 vector domain has special requirements
for debugging, like showing the vector usage on the CPUs.

Add a callback to the irqdomain ops which can be filled in by domains which
require it and add conditional invocations to the irqdomain and the per irq
debug files.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213152.512937505@linutronix.de
---
 include/linux/irqdomain.h | 6 +++++-
 kernel/irq/debugfs.c      | 2 ++
 kernel/irq/irqdomain.c    | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 81e4889ca6dd..1fb50438a868 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -40,6 +40,7 @@ struct of_device_id;
 struct irq_chip;
 struct irq_data;
 struct cpumask;
+struct seq_file;
 
 /* Number of irqs reserved for a legacy isa controller */
 #define NUM_ISA_INTERRUPTS	16
@@ -104,7 +105,6 @@ struct irq_domain_ops {
 	int (*xlate)(struct irq_domain *d, struct device_node *node,
 		     const u32 *intspec, unsigned int intsize,
 		     unsigned long *out_hwirq, unsigned int *out_type);
-
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 	/* extended V2 interfaces to support hierarchy irq_domains */
 	int (*alloc)(struct irq_domain *d, unsigned int virq,
@@ -116,6 +116,10 @@ struct irq_domain_ops {
 	int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec,
 			 unsigned long *out_hwirq, unsigned int *out_type);
 #endif
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+	void (*debug_show)(struct seq_file *m, struct irq_domain *d,
+			   struct irq_data *irqd, int ind);
+#endif
 };
 
 extern struct irq_domain_ops irq_generic_chip_ops;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index b7d1023b9b22..7f608ac39653 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -81,6 +81,8 @@ irq_debug_show_data(struct seq_file *m, struct irq_data *data, int ind)
 		   data->domain ? data->domain->name : "");
 	seq_printf(m, "%*shwirq:   0x%lx\n", ind + 1, "", data->hwirq);
 	irq_debug_show_chip(m, data, ind + 1);
+	if (data->domain && data->domain->ops && data->domain->ops->debug_show)
+		data->domain->ops->debug_show(m, NULL, data, ind + 1);
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 	if (!data->parent_data)
 		return;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e84b7056bb08..1a75a00ee01f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1810,6 +1810,8 @@ irq_domain_debug_show_one(struct seq_file *m, struct irq_domain *d, int ind)
 		   d->revmap_size + d->revmap_direct_max_irq);
 	seq_printf(m, "%*smapped: %u\n", ind + 1, "", d->mapcount);
 	seq_printf(m, "%*sflags:  0x%08x\n", ind +1 , "", d->flags);
+	if (d->ops && d->ops->debug_show)
+		d->ops->debug_show(m, d, NULL, ind + 1);
 #ifdef	CONFIG_IRQ_DOMAIN_HIERARCHY
 	if (!d->parent)
 		return;
-- 
cgit v1.3-14-g43fede


From 457f6d35072f395508255ef09fd08f824382cf85 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:07 +0200
Subject: genirq: Make state consistent for !IRQ_DOMAIN_HIERARCHY

In the !IRQ_DOMAIN_HIERARCHY cas the activation stubs are not
setting/clearing the activation status bits. This is not a problem at the
moment, but upcoming changes require a correct status.

Add the set/clear incovations to the stub functions and move them to the
core internal header to avoid duplication and visibility outside the core.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213152.591985591@linutronix.de
---
 include/linux/irqdomain.h |  4 ----
 kernel/irq/internals.h    | 11 +++++++++++
 2 files changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 1fb50438a868..de06ce992a2d 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -511,8 +511,6 @@ static inline bool irq_domain_is_msi_remap(struct irq_domain *domain)
 extern bool irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain);
 
 #else	/* CONFIG_IRQ_DOMAIN_HIERARCHY */
-static inline void irq_domain_activate_irq(struct irq_data *data) { }
-static inline void irq_domain_deactivate_irq(struct irq_data *data) { }
 static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
 			unsigned int nr_irqs, int node, void *arg)
 {
@@ -561,8 +559,6 @@ irq_domain_hierarchical_is_msi_remap(struct irq_domain *domain)
 
 #else /* CONFIG_IRQ_DOMAIN */
 static inline void irq_dispose_mapping(unsigned int virq) { }
-static inline void irq_domain_activate_irq(struct irq_data *data) { }
-static inline void irq_domain_deactivate_irq(struct irq_data *data) { }
 static inline struct irq_domain *irq_find_matching_fwnode(
 	struct fwnode_handle *fwnode, enum irq_domain_bus_token bus_token)
 {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index cfaec2669093..72b8da2a7c39 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -436,6 +436,17 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 }
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
+#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
+static inline void irq_domain_activate_irq(struct irq_data *data)
+{
+	irqd_set_activated(data);
+}
+static inline void irq_domain_deactivate_irq(struct irq_data *data)
+{
+	irqd_clr_activated(data);
+}
+#endif
+
 #ifdef CONFIG_GENERIC_IRQ_DEBUGFS
 #include <linux/debugfs.h>
 
-- 
cgit v1.3-14-g43fede


From 239306fee8a59beb728faf8f42b13b2250b24841 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:08 +0200
Subject: genirq: Set managed shut down flag at init

Managed interrupts should start up in managed shutdown mode. Set the status
flag when initialising the irq descriptor.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213152.669687742@linutronix.de
---
 kernel/irq/irqdesc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0f6805b75fc8..982a3576fb01 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -448,7 +448,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 		}
 	}
 
-	flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
+	flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
 	mask = NULL;
 
 	for (i = 0; i < cnt; i++) {
-- 
cgit v1.3-14-g43fede


From c942cee46bba761ce97ee6d4fc71892e064e8628 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:09 +0200
Subject: genirq: Separate activation and startup

Activation of an interrupt and startup are currently a combo
functionlity. That works so far, but upcoming changes require a strict
separation because the activation can fail in future.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213152.754334077@linutronix.de
---
 kernel/irq/autoprobe.c |  2 +-
 kernel/irq/chip.c      | 30 ++++++++++++++++++++++++------
 kernel/irq/internals.h |  2 ++
 kernel/irq/manage.c    | 17 ++++++++++++++++-
 4 files changed, 43 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index d30a0dd5cc02..6608d03efb23 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
 			if (desc->irq_data.chip->irq_set_type)
 				desc->irq_data.chip->irq_set_type(&desc->irq_data,
 							 IRQ_TYPE_PROBE);
-			irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE);
+			irq_activate_and_startup(desc, IRQ_NORESEND);
 		}
 		raw_spin_unlock_irq(&desc->lock);
 	}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6fc89fd93824..37dd34d922f4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -207,20 +207,19 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
 		 * Catch code which fiddles with enable_irq() on a managed
 		 * and potentially shutdown IRQ. Chained interrupt
 		 * installment or irq auto probing should not happen on
-		 * managed irqs either. Emit a warning, break the affinity
-		 * and start it up as a normal interrupt.
+		 * managed irqs either.
 		 */
 		if (WARN_ON_ONCE(force))
-			return IRQ_STARTUP_NORMAL;
+			return IRQ_STARTUP_ABORT;
 		/*
 		 * The interrupt was requested, but there is no online CPU
 		 * in it's affinity mask. Put it into managed shutdown
 		 * state and let the cpu hotplug mechanism start it up once
 		 * a CPU in the mask becomes available.
 		 */
-		irqd_set_managed_shutdown(d);
 		return IRQ_STARTUP_ABORT;
 	}
+	irq_domain_activate_irq(d);
 	return IRQ_STARTUP_MANAGED;
 }
 #else
@@ -236,7 +235,9 @@ static int __irq_startup(struct irq_desc *desc)
 	struct irq_data *d = irq_desc_get_irq_data(desc);
 	int ret = 0;
 
-	irq_domain_activate_irq(d);
+	/* Warn if this interrupt is not activated but try nevertheless */
+	WARN_ON_ONCE(!irqd_is_activated(d));
+
 	if (d->chip->irq_startup) {
 		ret = d->chip->irq_startup(d);
 		irq_state_clr_disabled(desc);
@@ -269,6 +270,7 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
 			irq_set_affinity_locked(d, aff, false);
 			break;
 		case IRQ_STARTUP_ABORT:
+			irqd_set_managed_shutdown(d);
 			return 0;
 		}
 	}
@@ -278,6 +280,22 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force)
 	return ret;
 }
 
+int irq_activate(struct irq_desc *desc)
+{
+	struct irq_data *d = irq_desc_get_irq_data(desc);
+
+	if (!irqd_affinity_is_managed(d))
+		irq_domain_activate_irq(d);
+	return 0;
+}
+
+void irq_activate_and_startup(struct irq_desc *desc, bool resend)
+{
+	if (WARN_ON(irq_activate(desc)))
+		return;
+	irq_startup(desc, resend, IRQ_START_FORCE);
+}
+
 static void __irq_disable(struct irq_desc *desc, bool mask);
 
 void irq_shutdown(struct irq_desc *desc)
@@ -953,7 +971,7 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 		irq_settings_set_norequest(desc);
 		irq_settings_set_nothread(desc);
 		desc->action = &chained_action;
-		irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
+		irq_activate_and_startup(desc, IRQ_RESEND);
 	}
 }
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 72b8da2a7c39..8bd131e4c944 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -74,6 +74,8 @@ extern void __enable_irq(struct irq_desc *desc);
 #define IRQ_START_FORCE	true
 #define IRQ_START_COND	false
 
+extern int irq_activate(struct irq_desc *desc);
+extern void irq_activate_and_startup(struct irq_desc *desc, bool resend);
 extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
 
 extern void irq_shutdown(struct irq_desc *desc);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0e8b48315f3c..e667912d0e9c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -519,7 +519,7 @@ void __enable_irq(struct irq_desc *desc)
 		 * time. If it was already started up, then irq_startup()
 		 * will invoke irq_enable() under the hood.
 		 */
-		irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
+		irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
 		break;
 	}
 	default:
@@ -1325,6 +1325,21 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 				goto out_unlock;
 		}
 
+		/*
+		 * Activate the interrupt. That activation must happen
+		 * independently of IRQ_NOAUTOEN. request_irq() can fail
+		 * and the callers are supposed to handle
+		 * that. enable_irq() of an interrupt requested with
+		 * IRQ_NOAUTOEN is not supposed to fail. The activation
+		 * keeps it in shutdown mode, it merily associates
+		 * resources if necessary and if that's not possible it
+		 * fails. Interrupts which are in managed shutdown mode
+		 * will simply ignore that activation request.
+		 */
+		ret = irq_activate(desc);
+		if (ret)
+			goto out_unlock;
+
 		desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
 				  IRQS_ONESHOT | IRQS_WAITING);
 		irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
-- 
cgit v1.3-14-g43fede


From 72491643469aab736536ae71dcd199b19dabd891 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:10 +0200
Subject: genirq/irqdomain: Update irq_domain_ops.activate() signature

The irq_domain_ops.activate() callback has no return value and no way to
tell the function that the activation is early.

The upcoming changes to support a reservation scheme which allows to assign
interrupt vectors on x86 only when the interrupt is actually requested
requires:

  - A return value, so activation can fail at request_irq() time

  - Information that the activate invocation is early, i.e. before
    request_irq().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213152.848490816@linutronix.de
---
 arch/x86/include/asm/irqdomain.h      |  4 ++--
 arch/x86/kernel/apic/htirq.c          |  5 +++--
 arch/x86/kernel/apic/io_apic.c        |  5 +++--
 arch/x86/platform/uv/uv_irq.c         |  5 +++--
 drivers/gpio/gpio-xgene-sb.c          |  8 +++++---
 drivers/iommu/amd_iommu.c             |  5 +++--
 drivers/iommu/intel_irq_remapping.c   |  5 +++--
 drivers/irqchip/irq-gic-v3-its.c      | 10 ++++++----
 drivers/pinctrl/stm32/pinctrl-stm32.c |  5 +++--
 include/linux/irqdomain.h             |  2 +-
 kernel/irq/irqdomain.c                |  2 +-
 kernel/irq/msi.c                      |  5 +++--
 12 files changed, 36 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
index d26075b52885..1d9091ffa140 100644
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -41,8 +41,8 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
 			      unsigned int nr_irqs, void *arg);
 extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
 			      unsigned int nr_irqs);
-extern void mp_irqdomain_activate(struct irq_domain *domain,
-				  struct irq_data *irq_data);
+extern int mp_irqdomain_activate(struct irq_domain *domain,
+				 struct irq_data *irq_data, bool early);
 extern void mp_irqdomain_deactivate(struct irq_domain *domain,
 				    struct irq_data *irq_data);
 extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
index 56ccf9346b08..b07075dce8b7 100644
--- a/arch/x86/kernel/apic/htirq.c
+++ b/arch/x86/kernel/apic/htirq.c
@@ -112,8 +112,8 @@ static void htirq_domain_free(struct irq_domain *domain, unsigned int virq,
 	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
-static void htirq_domain_activate(struct irq_domain *domain,
-				  struct irq_data *irq_data)
+static int htirq_domain_activate(struct irq_domain *domain,
+				 struct irq_data *irq_data, bool early)
 {
 	struct ht_irq_msg msg;
 	struct irq_cfg *cfg = irqd_cfg(irq_data);
@@ -132,6 +132,7 @@ static void htirq_domain_activate(struct irq_domain *domain,
 			HT_IRQ_LOW_MT_ARBITRATED) |
 		HT_IRQ_LOW_IRQ_MASKED;
 	write_ht_irq_msg(irq_data->irq, &msg);
+	return 0;
 }
 
 static void htirq_domain_deactivate(struct irq_domain *domain,
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 70e48aa6af98..d50e46757f6d 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2977,8 +2977,8 @@ void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
 	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
-void mp_irqdomain_activate(struct irq_domain *domain,
-			   struct irq_data *irq_data)
+int mp_irqdomain_activate(struct irq_domain *domain,
+			  struct irq_data *irq_data, bool early)
 {
 	unsigned long flags;
 	struct irq_pin_list *entry;
@@ -2988,6 +2988,7 @@ void mp_irqdomain_activate(struct irq_domain *domain,
 	for_each_irq_pin(entry, data->irq_2_pin)
 		__ioapic_write_entry(entry->apic, entry->pin, data->entry);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+	return 0;
 }
 
 void mp_irqdomain_deactivate(struct irq_domain *domain,
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 03fc397335b7..5f6fd860820a 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -127,10 +127,11 @@ static void uv_domain_free(struct irq_domain *domain, unsigned int virq,
  * Re-target the irq to the specified CPU and enable the specified MMR located
  * on the specified blade to allow the sending of MSIs to the specified CPU.
  */
-static void uv_domain_activate(struct irq_domain *domain,
-			       struct irq_data *irq_data)
+static int uv_domain_activate(struct irq_domain *domain,
+			      struct irq_data *irq_data, bool early)
 {
 	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
+	return 0;
 }
 
 /*
diff --git a/drivers/gpio/gpio-xgene-sb.c b/drivers/gpio/gpio-xgene-sb.c
index 033258634b8c..b5843fe6a44d 100644
--- a/drivers/gpio/gpio-xgene-sb.c
+++ b/drivers/gpio/gpio-xgene-sb.c
@@ -140,8 +140,9 @@ static int xgene_gpio_sb_to_irq(struct gpio_chip *gc, u32 gpio)
 	return irq_create_fwspec_mapping(&fwspec);
 }
 
-static void xgene_gpio_sb_domain_activate(struct irq_domain *d,
-		struct irq_data *irq_data)
+static int xgene_gpio_sb_domain_activate(struct irq_domain *d,
+					 struct irq_data *irq_data,
+					 bool early)
 {
 	struct xgene_gpio_sb *priv = d->host_data;
 	u32 gpio = HWIRQ_TO_GPIO(priv, irq_data->hwirq);
@@ -150,11 +151,12 @@ static void xgene_gpio_sb_domain_activate(struct irq_domain *d,
 		dev_err(priv->gc.parent,
 		"Unable to configure XGene GPIO standby pin %d as IRQ\n",
 				gpio);
-		return;
+		return -ENOSPC;
 	}
 
 	xgene_gpio_set_bit(&priv->gc, priv->regs + MPA_GPIO_SEL_LO,
 			gpio * 2, 1);
+	return 0;
 }
 
 static void xgene_gpio_sb_domain_deactivate(struct irq_domain *d,
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 51f8215877f5..ea03f4138f5f 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -4170,8 +4170,8 @@ static void irq_remapping_free(struct irq_domain *domain, unsigned int virq,
 	irq_domain_free_irqs_common(domain, virq, nr_irqs);
 }
 
-static void irq_remapping_activate(struct irq_domain *domain,
-				   struct irq_data *irq_data)
+static int irq_remapping_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data, bool early)
 {
 	struct amd_ir_data *data = irq_data->chip_data;
 	struct irq_2_irte *irte_info = &data->irq_2_irte;
@@ -4180,6 +4180,7 @@ static void irq_remapping_activate(struct irq_domain *domain,
 	if (iommu)
 		iommu->irte_ops->activate(data->entry, irte_info->devid,
 					  irte_info->index);
+	return 0;
 }
 
 static void irq_remapping_deactivate(struct irq_domain *domain,
diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c
index a5b89f6bcdbf..762d84713b7a 100644
--- a/drivers/iommu/intel_irq_remapping.c
+++ b/drivers/iommu/intel_irq_remapping.c
@@ -1389,12 +1389,13 @@ static void intel_irq_remapping_free(struct irq_domain *domain,
 	irq_domain_free_irqs_common(domain, virq, nr_irqs);
 }
 
-static void intel_irq_remapping_activate(struct irq_domain *domain,
-					 struct irq_data *irq_data)
+static int intel_irq_remapping_activate(struct irq_domain *domain,
+					struct irq_data *irq_data, bool early)
 {
 	struct intel_ir_data *data = irq_data->chip_data;
 
 	modify_irte(&data->irq_2_iommu, &data->irte_entry);
+	return 0;
 }
 
 static void intel_irq_remapping_deactivate(struct irq_domain *domain,
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index e8d89343d613..20e2b5fac7b9 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -2186,8 +2186,8 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 	return 0;
 }
 
-static void its_irq_domain_activate(struct irq_domain *domain,
-				    struct irq_data *d)
+static int its_irq_domain_activate(struct irq_domain *domain,
+				   struct irq_data *d, bool early)
 {
 	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
 	u32 event = its_get_event_id(d);
@@ -2205,6 +2205,7 @@ static void its_irq_domain_activate(struct irq_domain *domain,
 
 	/* Map the GIC IRQ and event to the device */
 	its_send_mapti(its_dev, d->hwirq, event);
+	return 0;
 }
 
 static void its_irq_domain_deactivate(struct irq_domain *domain,
@@ -2678,8 +2679,8 @@ static int its_vpe_irq_domain_alloc(struct irq_domain *domain, unsigned int virq
 	return err;
 }
 
-static void its_vpe_irq_domain_activate(struct irq_domain *domain,
-					struct irq_data *d)
+static int its_vpe_irq_domain_activate(struct irq_domain *domain,
+				       struct irq_data *d, bool early)
 {
 	struct its_vpe *vpe = irq_data_get_irq_chip_data(d);
 
@@ -2687,6 +2688,7 @@ static void its_vpe_irq_domain_activate(struct irq_domain *domain,
 	vpe->col_idx = cpumask_first(cpu_online_mask);
 	its_send_vmapp(vpe, true);
 	its_send_vinvall(vpe);
+	return 0;
 }
 
 static void its_vpe_irq_domain_deactivate(struct irq_domain *domain,
diff --git a/drivers/pinctrl/stm32/pinctrl-stm32.c b/drivers/pinctrl/stm32/pinctrl-stm32.c
index 50299ad96659..02b66588cac6 100644
--- a/drivers/pinctrl/stm32/pinctrl-stm32.c
+++ b/drivers/pinctrl/stm32/pinctrl-stm32.c
@@ -289,13 +289,14 @@ static int stm32_gpio_domain_translate(struct irq_domain *d,
 	return 0;
 }
 
-static void stm32_gpio_domain_activate(struct irq_domain *d,
-				       struct irq_data *irq_data)
+static int stm32_gpio_domain_activate(struct irq_domain *d,
+				      struct irq_data *irq_data, bool early)
 {
 	struct stm32_gpio_bank *bank = d->host_data;
 	struct stm32_pinctrl *pctl = dev_get_drvdata(bank->gpio_chip.parent);
 
 	regmap_field_write(pctl->irqmux[irq_data->hwirq], bank->bank_nr);
+	return 0;
 }
 
 static int stm32_gpio_domain_alloc(struct irq_domain *d,
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index de06ce992a2d..9bc07f59b090 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -111,7 +111,7 @@ struct irq_domain_ops {
 		     unsigned int nr_irqs, void *arg);
 	void (*free)(struct irq_domain *d, unsigned int virq,
 		     unsigned int nr_irqs);
-	void (*activate)(struct irq_domain *d, struct irq_data *irq_data);
+	int (*activate)(struct irq_domain *d, struct irq_data *irqd, bool early);
 	void (*deactivate)(struct irq_domain *d, struct irq_data *irq_data);
 	int (*translate)(struct irq_domain *d, struct irq_fwspec *fwspec,
 			 unsigned long *out_hwirq, unsigned int *out_type);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1a75a00ee01f..1423f8a6c75d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1690,7 +1690,7 @@ static void __irq_domain_activate_irq(struct irq_data *irq_data)
 		if (irq_data->parent_data)
 			__irq_domain_activate_irq(irq_data->parent_data);
 		if (domain->ops->activate)
-			domain->ops->activate(domain, irq_data);
+			domain->ops->activate(domain, irq_data, false);
 	}
 }
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 94ecc0293844..6155fff9f647 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -102,13 +102,14 @@ int msi_domain_set_affinity(struct irq_data *irq_data,
 	return ret;
 }
 
-static void msi_domain_activate(struct irq_domain *domain,
-				struct irq_data *irq_data)
+static int msi_domain_activate(struct irq_domain *domain,
+			       struct irq_data *irq_data, bool early)
 {
 	struct msi_msg msg;
 
 	BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg));
 	irq_chip_write_msi_msg(irq_data, &msg);
+	return 0;
 }
 
 static void msi_domain_deactivate(struct irq_domain *domain,
-- 
cgit v1.3-14-g43fede


From bb9b428a5c832d7abb494fbabac37c515c01c6c4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:11 +0200
Subject: genirq/irqdomain: Allow irq_domain_activate_irq() to fail

Allow irq_domain_activate_irq() to fail. This is required to support a
reservation and late vector assignment scheme.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213152.933882227@linutronix.de
---
 include/linux/irqdomain.h |  2 +-
 kernel/irq/chip.c         |  9 +++++++--
 kernel/irq/internals.h    |  3 ++-
 kernel/irq/irqdomain.c    | 40 +++++++++++++++++++++++++---------------
 kernel/irq/msi.c          | 19 +++++++++++++++++--
 5 files changed, 52 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 9bc07f59b090..8fb877121984 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -441,7 +441,7 @@ extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 				   unsigned int nr_irqs, int node, void *arg,
 				   bool realloc, const struct cpumask *affinity);
 extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs);
-extern void irq_domain_activate_irq(struct irq_data *irq_data);
+extern int irq_domain_activate_irq(struct irq_data *irq_data);
 extern void irq_domain_deactivate_irq(struct irq_data *irq_data);
 
 static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 37dd34d922f4..cd5b3eb38082 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -219,7 +219,12 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
 		 */
 		return IRQ_STARTUP_ABORT;
 	}
-	irq_domain_activate_irq(d);
+	/*
+	 * Managed interrupts have reserved resources, so this should not
+	 * happen.
+	 */
+	if (WARN_ON(irq_domain_activate_irq(d)))
+		return IRQ_STARTUP_ABORT;
 	return IRQ_STARTUP_MANAGED;
 }
 #else
@@ -285,7 +290,7 @@ int irq_activate(struct irq_desc *desc)
 	struct irq_data *d = irq_desc_get_irq_data(desc);
 
 	if (!irqd_affinity_is_managed(d))
-		irq_domain_activate_irq(d);
+		return irq_domain_activate_irq(d);
 	return 0;
 }
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 8bd131e4c944..e84d0e3899f6 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -439,9 +439,10 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
 #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
-static inline void irq_domain_activate_irq(struct irq_data *data)
+static inline int irq_domain_activate_irq(struct irq_data *data)
 {
 	irqd_set_activated(data);
+	return 0;
 }
 static inline void irq_domain_deactivate_irq(struct irq_data *data)
 {
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1423f8a6c75d..47e8ddd9e8cf 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1682,28 +1682,35 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
 }
 EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
 
-static void __irq_domain_activate_irq(struct irq_data *irq_data)
+static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
 {
 	if (irq_data && irq_data->domain) {
 		struct irq_domain *domain = irq_data->domain;
 
+		if (domain->ops->deactivate)
+			domain->ops->deactivate(domain, irq_data);
 		if (irq_data->parent_data)
-			__irq_domain_activate_irq(irq_data->parent_data);
-		if (domain->ops->activate)
-			domain->ops->activate(domain, irq_data, false);
+			__irq_domain_deactivate_irq(irq_data->parent_data);
 	}
 }
 
-static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
+static int __irq_domain_activate_irq(struct irq_data *irqd)
 {
-	if (irq_data && irq_data->domain) {
-		struct irq_domain *domain = irq_data->domain;
+	int ret = 0;
 
-		if (domain->ops->deactivate)
-			domain->ops->deactivate(domain, irq_data);
-		if (irq_data->parent_data)
-			__irq_domain_deactivate_irq(irq_data->parent_data);
+	if (irqd && irqd->domain) {
+		struct irq_domain *domain = irqd->domain;
+
+		if (irqd->parent_data)
+			ret = __irq_domain_activate_irq(irqd->parent_data);
+		if (!ret && domain->ops->activate) {
+			ret = domain->ops->activate(domain, irqd, false);
+			/* Rollback in case of error */
+			if (ret && irqd->parent_data)
+				__irq_domain_deactivate_irq(irqd->parent_data);
+		}
 	}
+	return ret;
 }
 
 /**
@@ -1714,12 +1721,15 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
  * This is the second step to call domain_ops->activate to program interrupt
  * controllers, so the interrupt could actually get delivered.
  */
-void irq_domain_activate_irq(struct irq_data *irq_data)
+int irq_domain_activate_irq(struct irq_data *irq_data)
 {
-	if (!irqd_is_activated(irq_data)) {
-		__irq_domain_activate_irq(irq_data);
+	int ret = 0;
+
+	if (!irqd_is_activated(irq_data))
+		ret = __irq_domain_activate_irq(irq_data);
+	if (!ret)
 		irqd_set_activated(irq_data);
-	}
+	return ret;
 }
 
 /**
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 6155fff9f647..5ece369950ec 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -401,11 +401,26 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 			struct irq_data *irq_data;
 
 			irq_data = irq_domain_get_irq_data(domain, desc->irq);
-			irq_domain_activate_irq(irq_data);
+			ret = irq_domain_activate_irq(irq_data);
+			if (ret)
+				goto cleanup;
 		}
 	}
-
 	return 0;
+
+cleanup:
+	for_each_msi_entry(desc, dev) {
+		struct irq_data *irqd;
+
+		if (desc->irq == virq)
+			break;
+
+		irqd = irq_domain_get_irq_data(domain, desc->irq);
+		if (irqd_is_activated(irqd))
+			irq_domain_deactivate_irq(irqd);
+	}
+	msi_domain_free_irqs(domain, dev);
+	return ret;
 }
 
 /**
-- 
cgit v1.3-14-g43fede


From 42e1cc2dc5b698181ab1ffb7972bd880230c506e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:12 +0200
Subject: genirq/irqdomain: Propagate early activation

Propagate the early activation mode to the irqdomain activate()
callbacks. This is required for the upcoming reservation, late vector
assignment scheme, so that the early activation call can act accordingly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213153.028353660@linutronix.de
---
 arch/x86/kernel/apic/io_apic.c |  4 ++--
 include/linux/irqdomain.h      |  2 +-
 kernel/irq/chip.c              |  4 ++--
 kernel/irq/internals.h         |  2 +-
 kernel/irq/irqdomain.c         | 11 ++++++-----
 kernel/irq/msi.c               |  2 +-
 6 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d50e46757f6d..6f1007fd9783 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2096,7 +2096,7 @@ static inline void __init check_timer(void)
 				unmask_ioapic_irq(irq_get_irq_data(0));
 		}
 		irq_domain_deactivate_irq(irq_data);
-		irq_domain_activate_irq(irq_data);
+		irq_domain_activate_irq(irq_data, false);
 		if (timer_irq_works()) {
 			if (disable_timer_pin_1 > 0)
 				clear_IO_APIC_pin(0, pin1);
@@ -2118,7 +2118,7 @@ static inline void __init check_timer(void)
 		 */
 		replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
 		irq_domain_deactivate_irq(irq_data);
-		irq_domain_activate_irq(irq_data);
+		irq_domain_activate_irq(irq_data, false);
 		legacy_pic->unmask(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 8fb877121984..7d0c6c144708 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -441,7 +441,7 @@ extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 				   unsigned int nr_irqs, int node, void *arg,
 				   bool realloc, const struct cpumask *affinity);
 extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs);
-extern int irq_domain_activate_irq(struct irq_data *irq_data);
+extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early);
 extern void irq_domain_deactivate_irq(struct irq_data *irq_data);
 
 static inline int irq_domain_alloc_irqs(struct irq_domain *domain,
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index cd5b3eb38082..82333835ac66 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -223,7 +223,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
 	 * Managed interrupts have reserved resources, so this should not
 	 * happen.
 	 */
-	if (WARN_ON(irq_domain_activate_irq(d)))
+	if (WARN_ON(irq_domain_activate_irq(d, false)))
 		return IRQ_STARTUP_ABORT;
 	return IRQ_STARTUP_MANAGED;
 }
@@ -290,7 +290,7 @@ int irq_activate(struct irq_desc *desc)
 	struct irq_data *d = irq_desc_get_irq_data(desc);
 
 	if (!irqd_affinity_is_managed(d))
-		return irq_domain_activate_irq(d);
+		return irq_domain_activate_irq(d, false);
 	return 0;
 }
 
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e84d0e3899f6..a0327136e469 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -439,7 +439,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
 #endif /* !CONFIG_GENERIC_PENDING_IRQ */
 
 #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
-static inline int irq_domain_activate_irq(struct irq_data *data)
+static inline int irq_domain_activate_irq(struct irq_data *data, bool early)
 {
 	irqd_set_activated(data);
 	return 0;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 47e8ddd9e8cf..b50f737574ae 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1694,7 +1694,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data)
 	}
 }
 
-static int __irq_domain_activate_irq(struct irq_data *irqd)
+static int __irq_domain_activate_irq(struct irq_data *irqd, bool early)
 {
 	int ret = 0;
 
@@ -1702,9 +1702,10 @@ static int __irq_domain_activate_irq(struct irq_data *irqd)
 		struct irq_domain *domain = irqd->domain;
 
 		if (irqd->parent_data)
-			ret = __irq_domain_activate_irq(irqd->parent_data);
+			ret = __irq_domain_activate_irq(irqd->parent_data,
+							early);
 		if (!ret && domain->ops->activate) {
-			ret = domain->ops->activate(domain, irqd, false);
+			ret = domain->ops->activate(domain, irqd, early);
 			/* Rollback in case of error */
 			if (ret && irqd->parent_data)
 				__irq_domain_deactivate_irq(irqd->parent_data);
@@ -1721,12 +1722,12 @@ static int __irq_domain_activate_irq(struct irq_data *irqd)
  * This is the second step to call domain_ops->activate to program interrupt
  * controllers, so the interrupt could actually get delivered.
  */
-int irq_domain_activate_irq(struct irq_data *irq_data)
+int irq_domain_activate_irq(struct irq_data *irq_data, bool early)
 {
 	int ret = 0;
 
 	if (!irqd_is_activated(irq_data))
-		ret = __irq_domain_activate_irq(irq_data);
+		ret = __irq_domain_activate_irq(irq_data, early);
 	if (!ret)
 		irqd_set_activated(irq_data);
 	return ret;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 5ece369950ec..d7553d015175 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -401,7 +401,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 			struct irq_data *irq_data;
 
 			irq_data = irq_domain_get_irq_data(domain, desc->irq);
-			ret = irq_domain_activate_irq(irq_data);
+			ret = irq_domain_activate_irq(irq_data, true);
 			if (ret)
 				goto cleanup;
 		}
-- 
cgit v1.3-14-g43fede


From 22d0b12f3560d3b3264ee79faa1c05a5060fb916 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:13 +0200
Subject: genirq/irqdomain: Add force reactivation flag to irq domains

Allow irqdomains to tell the core code, that after early activation the
interrupt needs to be reactivated at request_irq() time.

This allows reservation of vectors at early activation time and actual
vector assignment at request_irq() time.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213153.106242536@linutronix.de
---
 include/linux/msi.h | 5 +++++
 kernel/irq/msi.c    | 2 ++
 2 files changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/msi.h b/include/linux/msi.h
index 80e3b562bef6..eff16ef81f43 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -283,6 +283,11 @@ enum {
 	MSI_FLAG_PCI_MSIX		= (1 << 3),
 	/* Needs early activate, required for PCI */
 	MSI_FLAG_ACTIVATE_EARLY		= (1 << 4),
+	/*
+	 * Must reactivate when irq is started even when
+	 * MSI_FLAG_ACTIVATE_EARLY has been set.
+	 */
+	MSI_FLAG_MUST_REACTIVATE	= (1 << 5),
 };
 
 int msi_domain_set_affinity(struct irq_data *data, const struct cpumask *mask,
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index d7553d015175..edb987b2c58d 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -404,6 +404,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 			ret = irq_domain_activate_irq(irq_data, true);
 			if (ret)
 				goto cleanup;
+			if (info->flags & MSI_FLAG_MUST_REACTIVATE)
+				irqd_clr_activated(irq_data);
 		}
 	}
 	return 0;
-- 
cgit v1.3-14-g43fede


From 2f75d9e1c90511bff6d1ce4de94503cc28fec032 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:14 +0200
Subject: genirq: Implement bitmap matrix allocator

Implement the infrastructure for a simple bitmap based allocator, which
will replace the x86 vector allocator. It's in the core code as other
architectures might be able to reuse/extend it. For now it only implements
allocations for single CPUs, but it's simple to add multi CPU allocation
support if required.

The concept is rather simple:

 Global information:
 	system_vector bitmap
	global accounting

 PerCPU information:
 	allocation bitmap
	managed allocation bitmap
	local accounting

The system vector bitmap is used to exclude vectors system wide from the
allocation space.

The allocation bitmap is used to keep track of per cpu used vectors.

The managed allocation bitmap is used to reserve vectors for managed
interrupts.

When a regular (non managed) interrupt allocation happens then the
following rule applies:

      tmpmap = system_map | alloc_map | managed_map
      find_zero_bit(tmpmap)

Oring the bitmaps together gives the real available space. The same rule
applies for reserving a managed interrupt vector. But contrary to the
regular interrupts the reservation only marks the bit in the managed map
and therefor excludes it from the regular allocations. The managed map is
only cleaned out when the a managed interrupt is completely released and it
stays alive accross CPU offline/online operations.

For managed interrupt allocations the rule is:

      tmpmap = managed_map & ~alloc_map
      find_first_bit(tmpmap)

This returns the first bit which is in the managed map, but not yet
allocated in the allocation map. The allocation marks it in the allocation
map and hands it back to the caller for use.

The rest of the code are helper functions to handle the various
requirements and the accounting which are necessary to replace the x86
vector allocation code. The result is a single patch as the evolution of
this infrastructure cannot be represented in bits and pieces.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213153.185437174@linutronix.de
---
 include/linux/irq.h |  22 +++
 kernel/irq/Kconfig  |   3 +
 kernel/irq/Makefile |   1 +
 kernel/irq/matrix.c | 428 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 454 insertions(+)
 create mode 100644 kernel/irq/matrix.c

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d4728bf6a537..fda8da7c45e7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -1113,6 +1113,28 @@ static inline u32 irq_reg_readl(struct irq_chip_generic *gc,
 		return readl(gc->reg_base + reg_offset);
 }
 
+struct irq_matrix;
+struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
+				    unsigned int alloc_start,
+				    unsigned int alloc_end);
+void irq_matrix_online(struct irq_matrix *m);
+void irq_matrix_offline(struct irq_matrix *m);
+void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit, bool replace);
+int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk);
+void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk);
+int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu);
+void irq_matrix_reserve(struct irq_matrix *m);
+void irq_matrix_remove_reserved(struct irq_matrix *m);
+int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+		     bool reserved, unsigned int *mapped_cpu);
+void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+		     unsigned int bit, bool managed);
+void irq_matrix_assign(struct irq_matrix *m, unsigned int bit);
+unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown);
+unsigned int irq_matrix_allocated(struct irq_matrix *m);
+unsigned int irq_matrix_reserved(struct irq_matrix *m);
+void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind);
+
 /* Contrary to Linux irqs, for hardware irqs the irq number 0 is valid */
 #define INVALID_HWIRQ	(~0UL)
 irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index a117adf7084b..ac1a3e29d3b9 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -97,6 +97,9 @@ config HANDLE_DOMAIN_IRQ
 config IRQ_TIMINGS
 	bool
 
+config GENERIC_IRQ_MATRIX_ALLOCATOR
+	bool
+
 config IRQ_DOMAIN_DEBUG
 	bool "Expose hardware/virtual IRQ mapping via debugfs"
 	depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 1970cafe8f2a..329c9193b4bf 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -13,3 +13,4 @@ obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
 obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
 obj-$(CONFIG_SMP) += affinity.o
 obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
+obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
new file mode 100644
index 000000000000..7b2b4fbde1e2
--- /dev/null
+++ b/kernel/irq/matrix.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright (C) 2017 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * SPDX-License-Identifier: GPL-2.0
+ */
+#include <linux/spinlock.h>
+#include <linux/seq_file.h>
+#include <linux/bitmap.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/irq.h>
+
+#define IRQ_MATRIX_SIZE	(BITS_TO_LONGS(IRQ_MATRIX_BITS) * sizeof(unsigned long))
+
+struct cpumap {
+	unsigned int		available;
+	unsigned int		allocated;
+	unsigned int		managed;
+	bool			online;
+	unsigned long		alloc_map[IRQ_MATRIX_SIZE];
+	unsigned long		managed_map[IRQ_MATRIX_SIZE];
+};
+
+struct irq_matrix {
+	unsigned int		matrix_bits;
+	unsigned int		alloc_start;
+	unsigned int		alloc_end;
+	unsigned int		alloc_size;
+	unsigned int		global_available;
+	unsigned int		global_reserved;
+	unsigned int		systembits_inalloc;
+	unsigned int		total_allocated;
+	unsigned int		online_maps;
+	struct cpumap __percpu	*maps;
+	unsigned long		scratch_map[IRQ_MATRIX_SIZE];
+	unsigned long		system_map[IRQ_MATRIX_SIZE];
+};
+
+/**
+ * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it
+ * @matrix_bits:	Number of matrix bits must be <= IRQ_MATRIX_BITS
+ * @alloc_start:	From which bit the allocation search starts
+ * @alloc_end:		At which bit the allocation search ends, i.e first
+ *			invalid bit
+ */
+__init struct irq_matrix *irq_alloc_matrix(unsigned int matrix_bits,
+					   unsigned int alloc_start,
+					   unsigned int alloc_end)
+{
+	struct irq_matrix *m;
+
+	if (matrix_bits > IRQ_MATRIX_BITS)
+		return NULL;
+
+	m = kzalloc(sizeof(*m), GFP_KERNEL);
+	if (!m)
+		return NULL;
+
+	m->matrix_bits = matrix_bits;
+	m->alloc_start = alloc_start;
+	m->alloc_end = alloc_end;
+	m->alloc_size = alloc_end - alloc_start;
+	m->maps = alloc_percpu(*m->maps);
+	if (!m->maps) {
+		kfree(m);
+		return NULL;
+	}
+	return m;
+}
+
+/**
+ * irq_matrix_online - Bring the local CPU matrix online
+ * @m:		Matrix pointer
+ */
+void irq_matrix_online(struct irq_matrix *m)
+{
+	struct cpumap *cm = this_cpu_ptr(m->maps);
+
+	BUG_ON(cm->online);
+
+	bitmap_zero(cm->alloc_map, m->matrix_bits);
+	cm->available = m->alloc_size - (cm->managed + m->systembits_inalloc);
+	cm->allocated = 0;
+	m->global_available += cm->available;
+	cm->online = true;
+	m->online_maps++;
+}
+
+/**
+ * irq_matrix_offline - Bring the local CPU matrix offline
+ * @m:		Matrix pointer
+ */
+void irq_matrix_offline(struct irq_matrix *m)
+{
+	struct cpumap *cm = this_cpu_ptr(m->maps);
+
+	/* Update the global available size */
+	m->global_available -= cm->available;
+	cm->online = false;
+	m->online_maps--;
+}
+
+static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm,
+				      unsigned int num, bool managed)
+{
+	unsigned int area, start = m->alloc_start;
+	unsigned int end = m->alloc_end;
+
+	bitmap_or(m->scratch_map, cm->managed_map, m->system_map, end);
+	bitmap_or(m->scratch_map, m->scratch_map, cm->alloc_map, end);
+	area = bitmap_find_next_zero_area(m->scratch_map, end, start, num, 0);
+	if (area >= end)
+		return area;
+	if (managed)
+		bitmap_set(cm->managed_map, area, num);
+	else
+		bitmap_set(cm->alloc_map, area, num);
+	return area;
+}
+
+/**
+ * irq_matrix_assign_system - Assign system wide entry in the matrix
+ * @m:		Matrix pointer
+ * @bit:	Which bit to reserve
+ * @replace:	Replace an already allocated vector with a system
+ *		vector at the same bit position.
+ *
+ * The BUG_ON()s below are on purpose. If this goes wrong in the
+ * early boot process, then the chance to survive is about zero.
+ * If this happens when the system is life, it's not much better.
+ */
+void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit,
+			      bool replace)
+{
+	struct cpumap *cm = this_cpu_ptr(m->maps);
+
+	BUG_ON(bit > m->matrix_bits);
+	BUG_ON(m->online_maps > 1 || (m->online_maps && !replace));
+
+	set_bit(bit, m->system_map);
+	if (replace) {
+		BUG_ON(!test_and_clear_bit(bit, cm->alloc_map));
+		cm->allocated--;
+		m->total_allocated--;
+	}
+	if (bit >= m->alloc_start && bit < m->alloc_end)
+		m->systembits_inalloc++;
+}
+
+/**
+ * irq_matrix_reserve_managed - Reserve a managed interrupt in a CPU map
+ * @m:		Matrix pointer
+ * @msk:	On which CPUs the bits should be reserved.
+ *
+ * Can be called for offline CPUs. Note, this will only reserve one bit
+ * on all CPUs in @msk, but it's not guaranteed that the bits are at the
+ * same offset on all CPUs
+ */
+int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+	unsigned int cpu, failed_cpu;
+
+	for_each_cpu(cpu, msk) {
+		struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+		unsigned int bit;
+
+		bit = matrix_alloc_area(m, cm, 1, true);
+		if (bit >= m->alloc_end)
+			goto cleanup;
+		cm->managed++;
+		if (cm->online) {
+			cm->available--;
+			m->global_available--;
+		}
+	}
+	return 0;
+cleanup:
+	failed_cpu = cpu;
+	for_each_cpu(cpu, msk) {
+		if (cpu == failed_cpu)
+			break;
+		irq_matrix_remove_managed(m, cpumask_of(cpu));
+	}
+	return -ENOSPC;
+}
+
+/**
+ * irq_matrix_remove_managed - Remove managed interrupts in a CPU map
+ * @m:		Matrix pointer
+ * @msk:	On which CPUs the bits should be removed
+ *
+ * Can be called for offline CPUs
+ *
+ * This removes not allocated managed interrupts from the map. It does
+ * not matter which one because the managed interrupts free their
+ * allocation when they shut down. If not, the accounting is screwed,
+ * but all what can be done at this point is warn about it.
+ */
+void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, msk) {
+		struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+		unsigned int bit, end = m->alloc_end;
+
+		if (WARN_ON_ONCE(!cm->managed))
+			continue;
+
+		/* Get managed bit which are not allocated */
+		bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+
+		bit = find_first_bit(m->scratch_map, end);
+		if (WARN_ON_ONCE(bit >= end))
+			continue;
+
+		clear_bit(bit, cm->managed_map);
+
+		cm->managed--;
+		if (cm->online) {
+			cm->available++;
+			m->global_available++;
+		}
+	}
+}
+
+/**
+ * irq_matrix_alloc_managed - Allocate a managed interrupt in a CPU map
+ * @m:		Matrix pointer
+ * @cpu:	On which CPU the interrupt should be allocated
+ */
+int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu)
+{
+	struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+	unsigned int bit, end = m->alloc_end;
+
+	/* Get managed bit which are not allocated */
+	bitmap_andnot(m->scratch_map, cm->managed_map, cm->alloc_map, end);
+	bit = find_first_bit(m->scratch_map, end);
+	if (bit >= end)
+		return -ENOSPC;
+	set_bit(bit, cm->alloc_map);
+	cm->allocated++;
+	m->total_allocated++;
+	return bit;
+}
+
+/**
+ * irq_matrix_assign - Assign a preallocated interrupt in the local CPU map
+ * @m:		Matrix pointer
+ * @bit:	Which bit to mark
+ *
+ * This should only be used to mark preallocated vectors
+ */
+void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
+{
+	struct cpumap *cm = this_cpu_ptr(m->maps);
+
+	if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+		return;
+	if (WARN_ON_ONCE(test_and_set_bit(bit, cm->alloc_map)))
+		return;
+	cm->allocated++;
+	m->total_allocated++;
+	cm->available--;
+	m->global_available--;
+}
+
+/**
+ * irq_matrix_reserve - Reserve interrupts
+ * @m:		Matrix pointer
+ *
+ * This is merily a book keeping call. It increments the number of globally
+ * reserved interrupt bits w/o actually allocating them. This allows to
+ * setup interrupt descriptors w/o assigning low level resources to it.
+ * The actual allocation happens when the interrupt gets activated.
+ */
+void irq_matrix_reserve(struct irq_matrix *m)
+{
+	if (m->global_reserved <= m->global_available &&
+	    m->global_reserved + 1 > m->global_available)
+		pr_warn("Interrupt reservation exceeds available resources\n");
+
+	m->global_reserved++;
+}
+
+/**
+ * irq_matrix_remove_reserved - Remove interrupt reservation
+ * @m:		Matrix pointer
+ *
+ * This is merily a book keeping call. It decrements the number of globally
+ * reserved interrupt bits. This is used to undo irq_matrix_reserve() when the
+ * interrupt was never in use and a real vector allocated, which undid the
+ * reservation.
+ */
+void irq_matrix_remove_reserved(struct irq_matrix *m)
+{
+	m->global_reserved--;
+}
+
+/**
+ * irq_matrix_alloc - Allocate a regular interrupt in a CPU map
+ * @m:		Matrix pointer
+ * @msk:	Which CPUs to search in
+ * @reserved:	Allocate previously reserved interrupts
+ * @mapped_cpu: Pointer to store the CPU for which the irq was allocated
+ */
+int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
+		     bool reserved, unsigned int *mapped_cpu)
+{
+	unsigned int cpu;
+
+	for_each_cpu(cpu, msk) {
+		struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+		unsigned int bit;
+
+		if (!cm->online)
+			continue;
+
+		bit = matrix_alloc_area(m, cm, 1, false);
+		if (bit < m->alloc_end) {
+			cm->allocated++;
+			cm->available--;
+			m->total_allocated++;
+			m->global_available--;
+			if (reserved)
+				m->global_reserved--;
+			*mapped_cpu = cpu;
+			return bit;
+		}
+	}
+	return -ENOSPC;
+}
+
+/**
+ * irq_matrix_free - Free allocated interrupt in the matrix
+ * @m:		Matrix pointer
+ * @cpu:	Which CPU map needs be updated
+ * @bit:	The bit to remove
+ * @managed:	If true, the interrupt is managed and not accounted
+ *		as available.
+ */
+void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
+		     unsigned int bit, bool managed)
+{
+	struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+	if (WARN_ON_ONCE(bit < m->alloc_start || bit >= m->alloc_end))
+		return;
+
+	if (cm->online) {
+		clear_bit(bit, cm->alloc_map);
+		cm->allocated--;
+		m->total_allocated--;
+		if (!managed) {
+			cm->available++;
+			m->global_available++;
+		}
+	}
+}
+
+/**
+ * irq_matrix_available - Get the number of globally available irqs
+ * @m:		Pointer to the matrix to query
+ * @cpudown:	If true, the local CPU is about to go down, adjust
+ *		the number of available irqs accordingly
+ */
+unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
+{
+	struct cpumap *cm = this_cpu_ptr(m->maps);
+
+	return m->global_available - cpudown ? cm->available : 0;
+}
+
+/**
+ * irq_matrix_reserved - Get the number of globally reserved irqs
+ * @m:		Pointer to the matrix to query
+ */
+unsigned int irq_matrix_reserved(struct irq_matrix *m)
+{
+	return m->global_reserved;
+}
+
+/**
+ * irq_matrix_allocated - Get the number of allocated irqs on the local cpu
+ * @m:		Pointer to the matrix to search
+ *
+ * This returns number of allocated irqs
+ */
+unsigned int irq_matrix_allocated(struct irq_matrix *m)
+{
+	struct cpumap *cm = this_cpu_ptr(m->maps);
+
+	return cm->allocated;
+}
+
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+/**
+ * irq_matrix_debug_show - Show detailed allocation information
+ * @sf:		Pointer to the seq_file to print to
+ * @m:		Pointer to the matrix allocator
+ * @ind:	Indentation for the print format
+ *
+ * Note, this is a lockless snapshot.
+ */
+void irq_matrix_debug_show(struct seq_file *sf, struct irq_matrix *m, int ind)
+{
+	unsigned int nsys = bitmap_weight(m->system_map, m->matrix_bits);
+	int cpu;
+
+	seq_printf(sf, "Online bitmaps:   %6u\n", m->online_maps);
+	seq_printf(sf, "Global available: %6u\n", m->global_available);
+	seq_printf(sf, "Global reserved:  %6u\n", m->global_reserved);
+	seq_printf(sf, "Total allocated:  %6u\n", m->total_allocated);
+	seq_printf(sf, "System: %u: %*pbl\n", nsys, m->matrix_bits,
+		   m->system_map);
+	seq_printf(sf, "%*s| CPU | avl | man | act | vectors\n", ind, " ");
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct cpumap *cm = per_cpu_ptr(m->maps, cpu);
+
+		seq_printf(sf, "%*s %4d  %4u  %4u  %4u  %*pbl\n", ind, " ",
+			   cpu, cm->available, cm->managed, cm->allocated,
+			   m->matrix_bits, cm->alloc_map);
+	}
+	cpus_read_unlock();
+}
+#endif
-- 
cgit v1.3-14-g43fede


From ec0f7cd273dc41ab28bba703cac82690ea5f2863 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 13 Sep 2017 23:29:15 +0200
Subject: genirq/matrix: Add tracepoints

Add tracepoints for the irq bitmap matrix allocator.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Juergen Gross <jgross@suse.com>
Tested-by: Yu Chen <yu.c.chen@intel.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: Alok Kataria <akataria@vmware.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rui Zhang <rui.zhang@intel.com>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Len Brown <lenb@kernel.org>
Link: https://lkml.kernel.org/r/20170913213153.279468022@linutronix.de
---
 include/trace/events/irq_matrix.h | 201 ++++++++++++++++++++++++++++++++++++++
 kernel/irq/matrix.c               |  15 +++
 2 files changed, 216 insertions(+)
 create mode 100644 include/trace/events/irq_matrix.h

(limited to 'kernel')

diff --git a/include/trace/events/irq_matrix.h b/include/trace/events/irq_matrix.h
new file mode 100644
index 000000000000..267d4cbbf360
--- /dev/null
+++ b/include/trace/events/irq_matrix.h
@@ -0,0 +1,201 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq_matrix
+
+#if !defined(_TRACE_IRQ_MATRIX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IRQ_MATRIX_H
+
+#include <linux/tracepoint.h>
+
+struct irq_matrix;
+struct cpumap;
+
+DECLARE_EVENT_CLASS(irq_matrix_global,
+
+	TP_PROTO(struct irq_matrix *matrix),
+
+	TP_ARGS(matrix),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	online_maps		)
+		__field(	unsigned int,	global_available	)
+		__field(	unsigned int,	global_reserved		)
+		__field(	unsigned int,	total_allocated		)
+	),
+
+	TP_fast_assign(
+		__entry->online_maps		= matrix->online_maps;
+		__entry->global_available	= matrix->global_available;
+		__entry->global_reserved	= matrix->global_reserved;
+		__entry->total_allocated	= matrix->total_allocated;
+	),
+
+	TP_printk("online_maps=%d global_avl=%u, global_rsvd=%u, total_alloc=%u",
+		  __entry->online_maps, __entry->global_available,
+		  __entry->global_reserved, __entry->total_allocated)
+);
+
+DECLARE_EVENT_CLASS(irq_matrix_global_update,
+
+	TP_PROTO(int bit, struct irq_matrix *matrix),
+
+	TP_ARGS(bit, matrix),
+
+	TP_STRUCT__entry(
+		__field(	int,		bit			)
+		__field(	unsigned int,	online_maps		)
+		__field(	unsigned int,	global_available	)
+		__field(	unsigned int,	global_reserved		)
+		__field(	unsigned int,	total_allocated		)
+	),
+
+	TP_fast_assign(
+		__entry->bit			= bit;
+		__entry->online_maps		= matrix->online_maps;
+		__entry->global_available	= matrix->global_available;
+		__entry->global_reserved	= matrix->global_reserved;
+		__entry->total_allocated	= matrix->total_allocated;
+	),
+
+	TP_printk("bit=%d online_maps=%d global_avl=%u, global_rsvd=%u, total_alloc=%u",
+		  __entry->bit, __entry->online_maps,
+		  __entry->global_available, __entry->global_reserved,
+		  __entry->total_allocated)
+);
+
+DECLARE_EVENT_CLASS(irq_matrix_cpu,
+
+	TP_PROTO(int bit, unsigned int cpu, struct irq_matrix *matrix,
+		 struct cpumap *cmap),
+
+	TP_ARGS(bit, cpu, matrix, cmap),
+
+	TP_STRUCT__entry(
+		__field(	int,		bit			)
+		__field(	unsigned int,	cpu			)
+		__field(	bool,		online			)
+		__field(	unsigned int,	available		)
+		__field(	unsigned int,	allocated		)
+		__field(	unsigned int,	managed			)
+		__field(	unsigned int,	online_maps		)
+		__field(	unsigned int,	global_available	)
+		__field(	unsigned int,	global_reserved		)
+		__field(	unsigned int,	total_allocated		)
+	),
+
+	TP_fast_assign(
+		__entry->bit			= bit;
+		__entry->cpu			= cpu;
+		__entry->online			= cmap->online;
+		__entry->available		= cmap->available;
+		__entry->allocated		= cmap->allocated;
+		__entry->managed		= cmap->managed;
+		__entry->online_maps		= matrix->online_maps;
+		__entry->global_available	= matrix->global_available;
+		__entry->global_reserved	= matrix->global_reserved;
+		__entry->total_allocated	= matrix->total_allocated;
+	),
+
+	TP_printk("bit=%d cpu=%u online=%d avl=%u alloc=%u managed=%u online_maps=%u global_avl=%u, global_rsvd=%u, total_alloc=%u",
+		  __entry->bit, __entry->cpu, __entry->online,
+		  __entry->available, __entry->allocated,
+		  __entry->managed, __entry->online_maps,
+		  __entry->global_available, __entry->global_reserved,
+		  __entry->total_allocated)
+);
+
+DEFINE_EVENT(irq_matrix_global, irq_matrix_online,
+
+	TP_PROTO(struct irq_matrix *matrix),
+
+	TP_ARGS(matrix)
+);
+
+DEFINE_EVENT(irq_matrix_global, irq_matrix_offline,
+
+	TP_PROTO(struct irq_matrix *matrix),
+
+	TP_ARGS(matrix)
+);
+
+DEFINE_EVENT(irq_matrix_global, irq_matrix_reserve,
+
+	TP_PROTO(struct irq_matrix *matrix),
+
+	TP_ARGS(matrix)
+);
+
+DEFINE_EVENT(irq_matrix_global, irq_matrix_remove_reserved,
+
+	TP_PROTO(struct irq_matrix *matrix),
+
+	TP_ARGS(matrix)
+);
+
+DEFINE_EVENT(irq_matrix_global_update, irq_matrix_assign_system,
+
+	TP_PROTO(int bit, struct irq_matrix *matrix),
+
+	TP_ARGS(bit, matrix)
+);
+
+DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc_reserved,
+
+	TP_PROTO(int bit, unsigned int cpu,
+		 struct irq_matrix *matrix, struct cpumap *cmap),
+
+	TP_ARGS(bit, cpu, matrix, cmap)
+);
+
+DEFINE_EVENT(irq_matrix_cpu, irq_matrix_reserve_managed,
+
+	TP_PROTO(int bit, unsigned int cpu,
+		 struct irq_matrix *matrix, struct cpumap *cmap),
+
+	TP_ARGS(bit, cpu, matrix, cmap)
+);
+
+DEFINE_EVENT(irq_matrix_cpu, irq_matrix_remove_managed,
+
+	TP_PROTO(int bit, unsigned int cpu,
+		 struct irq_matrix *matrix, struct cpumap *cmap),
+
+	TP_ARGS(bit, cpu, matrix, cmap)
+);
+
+DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc_managed,
+
+	TP_PROTO(int bit, unsigned int cpu,
+		 struct irq_matrix *matrix, struct cpumap *cmap),
+
+	TP_ARGS(bit, cpu, matrix, cmap)
+);
+
+DEFINE_EVENT(irq_matrix_cpu, irq_matrix_assign,
+
+	TP_PROTO(int bit, unsigned int cpu,
+		 struct irq_matrix *matrix, struct cpumap *cmap),
+
+	TP_ARGS(bit, cpu, matrix, cmap)
+);
+
+DEFINE_EVENT(irq_matrix_cpu, irq_matrix_alloc,
+
+	TP_PROTO(int bit, unsigned int cpu,
+		 struct irq_matrix *matrix, struct cpumap *cmap),
+
+	TP_ARGS(bit, cpu, matrix, cmap)
+);
+
+DEFINE_EVENT(irq_matrix_cpu, irq_matrix_free,
+
+	TP_PROTO(int bit, unsigned int cpu,
+		 struct irq_matrix *matrix, struct cpumap *cmap),
+
+	TP_ARGS(bit, cpu, matrix, cmap)
+);
+
+
+#endif /*  _TRACE_IRQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index 7b2b4fbde1e2..a3cbbc8191c5 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -36,6 +36,9 @@ struct irq_matrix {
 	unsigned long		system_map[IRQ_MATRIX_SIZE];
 };
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq_matrix.h>
+
 /**
  * irq_alloc_matrix - Allocate a irq_matrix structure and initialize it
  * @matrix_bits:	Number of matrix bits must be <= IRQ_MATRIX_BITS
@@ -84,6 +87,7 @@ void irq_matrix_online(struct irq_matrix *m)
 	m->global_available += cm->available;
 	cm->online = true;
 	m->online_maps++;
+	trace_irq_matrix_online(m);
 }
 
 /**
@@ -98,6 +102,7 @@ void irq_matrix_offline(struct irq_matrix *m)
 	m->global_available -= cm->available;
 	cm->online = false;
 	m->online_maps--;
+	trace_irq_matrix_offline(m);
 }
 
 static unsigned int matrix_alloc_area(struct irq_matrix *m, struct cpumap *cm,
@@ -145,6 +150,8 @@ void irq_matrix_assign_system(struct irq_matrix *m, unsigned int bit,
 	}
 	if (bit >= m->alloc_start && bit < m->alloc_end)
 		m->systembits_inalloc++;
+
+	trace_irq_matrix_assign_system(bit, m);
 }
 
 /**
@@ -172,6 +179,7 @@ int irq_matrix_reserve_managed(struct irq_matrix *m, const struct cpumask *msk)
 			cm->available--;
 			m->global_available--;
 		}
+		trace_irq_matrix_reserve_managed(bit, cpu, m, cm);
 	}
 	return 0;
 cleanup:
@@ -221,6 +229,7 @@ void irq_matrix_remove_managed(struct irq_matrix *m, const struct cpumask *msk)
 			cm->available++;
 			m->global_available++;
 		}
+		trace_irq_matrix_remove_managed(bit, cpu, m, cm);
 	}
 }
 
@@ -242,6 +251,7 @@ int irq_matrix_alloc_managed(struct irq_matrix *m, unsigned int cpu)
 	set_bit(bit, cm->alloc_map);
 	cm->allocated++;
 	m->total_allocated++;
+	trace_irq_matrix_alloc_managed(bit, cpu, m, cm);
 	return bit;
 }
 
@@ -264,6 +274,7 @@ void irq_matrix_assign(struct irq_matrix *m, unsigned int bit)
 	m->total_allocated++;
 	cm->available--;
 	m->global_available--;
+	trace_irq_matrix_assign(bit, smp_processor_id(), m, cm);
 }
 
 /**
@@ -282,6 +293,7 @@ void irq_matrix_reserve(struct irq_matrix *m)
 		pr_warn("Interrupt reservation exceeds available resources\n");
 
 	m->global_reserved++;
+	trace_irq_matrix_reserve(m);
 }
 
 /**
@@ -296,6 +308,7 @@ void irq_matrix_reserve(struct irq_matrix *m)
 void irq_matrix_remove_reserved(struct irq_matrix *m)
 {
 	m->global_reserved--;
+	trace_irq_matrix_remove_reserved(m);
 }
 
 /**
@@ -326,6 +339,7 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk,
 			if (reserved)
 				m->global_reserved--;
 			*mapped_cpu = cpu;
+			trace_irq_matrix_alloc(bit, cpu, m, cm);
 			return bit;
 		}
 	}
@@ -357,6 +371,7 @@ void irq_matrix_free(struct irq_matrix *m, unsigned int cpu,
 			m->global_available++;
 		}
 	}
+	trace_irq_matrix_free(bit, cpu, m, cm);
 }
 
 /**
-- 
cgit v1.3-14-g43fede


From 5df32107f609c1f621bcdac0a685c23677ef671e Mon Sep 17 00:00:00 2001
From: Prarit Bhargava <prarit@redhat.com>
Date: Mon, 28 Aug 2017 08:21:53 -0400
Subject: timekeeping: Make fast accessors return 0 before timekeeping is
 initialized

printk timestamps will be extended to include mono and boot time by using
the fast timekeeping accessors ktime_get_mono|boot_fast_ns().  The
functions can return garbage before timekeeping is initialized resulting in
garbage timestamps.

Initialize the fast timekeepers with dummy clocks which guarantee a 0
readout up to timekeeping_init().

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/1503922914-10660-2-git-send-email-prarit@redhat.com
---
 kernel/time/timekeeping.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2cafb49aa65e..6a92794427c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -60,8 +60,27 @@ struct tk_fast {
 	struct tk_read_base	base[2];
 };
 
-static struct tk_fast tk_fast_mono ____cacheline_aligned;
-static struct tk_fast tk_fast_raw  ____cacheline_aligned;
+/* Suspend-time cycles value for halted fast timekeeper. */
+static u64 cycles_at_suspend;
+
+static u64 dummy_clock_read(struct clocksource *cs)
+{
+	return cycles_at_suspend;
+}
+
+static struct clocksource dummy_clock = {
+	.read = dummy_clock_read,
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned = {
+	.base[0] = { .clock = &dummy_clock, },
+	.base[1] = { .clock = &dummy_clock, },
+};
+
+static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
+	.base[0] = { .clock = &dummy_clock, },
+	.base[1] = { .clock = &dummy_clock, },
+};
 
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
@@ -477,18 +496,6 @@ u64 notrace ktime_get_boot_fast_ns(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
 
-/* Suspend-time cycles value for halted fast timekeeper. */
-static u64 cycles_at_suspend;
-
-static u64 dummy_clock_read(struct clocksource *cs)
-{
-	return cycles_at_suspend;
-}
-
-static struct clocksource dummy_clock = {
-	.read = dummy_clock_read,
-};
-
 /**
  * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
  * @tk: Timekeeper to snapshot.
-- 
cgit v1.3-14-g43fede


From 4c3711d7fb4763c63b2654f2d07cbe21ca5aadd4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 31 Aug 2017 17:12:48 +0200
Subject: timekeeping: Provide NMI safe access to clock realtime

The configurable printk timestamping wants access to clock realtime. Right
now there is no ktime_get_real_fast_ns() accessor because reading the
monotonic base and the realtime offset cannot be done atomically. Contrary
to boot time this offset can change during runtime and cause half updated
readouts.

struct tk_read_base was fully packed when the fast timekeeper access was
implemented. commit ceea5e3771ed ("time: Fix clock->read(clock) race around
clocksource changes") removed the 'read' function pointer from the
structure, but of course left the comment stale.

So now the structure can fit a new 64bit member w/o violating the cache
line constraints.

Add real_base to tk_read_base and update it in the fast timekeeper update
sequence.

Implement an accessor which follows the same scheme as the accessor to
clock monotonic, but uses the new real_base to access clock real time.

The runtime overhead for updating real_base is minimal as it just adds two
cache hot values and stores them into an already dirtied cache line along
with the other fast timekeeper updates.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead,org>
Link: https://lkml.kernel.org/r/1505757060-2004-3-git-send-email-prarit@redhat.com
---
 include/linux/timekeeper_internal.h |  6 +++++-
 include/linux/timekeeping.h         |  1 +
 kernel/time/timekeeping.c           | 35 +++++++++++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 0a0a53daf2a2..98f645ee8409 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -13,19 +13,22 @@
 /**
  * struct tk_read_base - base structure for timekeeping readout
  * @clock:	Current clocksource used for timekeeping.
- * @read:	Read function of @clock
  * @mask:	Bitmask for two's complement subtraction of non 64bit clocks
  * @cycle_last: @clock cycle value at last update
  * @mult:	(NTP adjusted) multiplier for scaled math conversion
  * @shift:	Shift value for scaled math conversion
  * @xtime_nsec: Shifted (fractional) nano seconds offset for readout
  * @base:	ktime_t (nanoseconds) base time for readout
+ * @base_real:	Nanoseconds base value for clock REALTIME readout
  *
  * This struct has size 56 byte on 64 bit. Together with a seqcount it
  * occupies a single 64byte cache line.
  *
  * The struct is separate from struct timekeeper as it is also used
  * for a fast NMI safe accessors.
+ *
+ * @base_real is for the fast NMI safe accessor to allow reading clock
+ * realtime from any context.
  */
 struct tk_read_base {
 	struct clocksource	*clock;
@@ -35,6 +38,7 @@ struct tk_read_base {
 	u32			shift;
 	u64			xtime_nsec;
 	ktime_t			base;
+	u64			base_real;
 };
 
 /**
diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index ddc229ff6d1e..eb98cbdbb323 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -239,6 +239,7 @@ static inline u64 ktime_get_raw_ns(void)
 extern u64 ktime_get_mono_fast_ns(void);
 extern u64 ktime_get_raw_fast_ns(void);
 extern u64 ktime_get_boot_fast_ns(void);
+extern u64 ktime_get_real_fast_ns(void);
 
 /*
  * Timespec interfaces utilizing the ktime based ones
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 6a92794427c9..8af77006e937 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -496,6 +496,39 @@ u64 notrace ktime_get_boot_fast_ns(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
 
+
+/*
+ * See comment for __ktime_get_fast_ns() vs. timestamp ordering
+ */
+static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
+{
+	struct tk_read_base *tkr;
+	unsigned int seq;
+	u64 now;
+
+	do {
+		seq = raw_read_seqcount_latch(&tkf->seq);
+		tkr = tkf->base + (seq & 0x01);
+		now = ktime_to_ns(tkr->base_real);
+
+		now += timekeeping_delta_to_ns(tkr,
+				clocksource_delta(
+					tk_clock_read(tkr),
+					tkr->cycle_last,
+					tkr->mask));
+	} while (read_seqcount_retry(&tkf->seq, seq));
+
+	return now;
+}
+
+/**
+ * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ */
+u64 ktime_get_real_fast_ns(void)
+{
+	return __ktime_get_real_fast_ns(&tk_fast_mono);
+}
+
 /**
  * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
  * @tk: Timekeeper to snapshot.
@@ -514,6 +547,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
 	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
 	cycles_at_suspend = tk_clock_read(tkr);
 	tkr_dummy.clock = &dummy_clock;
+	tkr_dummy.base_real = tkr->base + tk->offs_real;
 	update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
 
 	tkr = &tk->tkr_raw;
@@ -661,6 +695,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 	update_vsyscall(tk);
 	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
 
+	tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
 	update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
 	update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
 
-- 
cgit v1.3-14-g43fede


From 38683148828165ea0b66ace93a9fedc2d3281e27 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 13:50:20 -0700
Subject: cgroup: statically initialize init_css_set->dfl_cgrp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Like other csets, init_css_set's dfl_cgrp is initialized when the cset
gets linked.  init_css_set gets linked in cgroup_init().  This has
been fine till now but the recently added basic CPU usage accounting
may end up accessing dfl_cgrp of init before cgroup_init() leading to
the following oops.

  SELinux:  Initializing.
  BUG: unable to handle kernel NULL pointer dereference at 00000000000000b0
  IP: account_system_index_time+0x60/0x90
  PGD 0 P4D 0
  Oops: 0000 [#1] SMP
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.14.0-rc2-00003-g041cd64 #10
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
  +1.9.3-20161025_171302-gandalf 04/01/2014
  task: ffffffff81e10480 task.stack: ffffffff81e00000
  RIP: 0010:account_system_index_time+0x60/0x90
  RSP: 0000:ffff880011e03cb8 EFLAGS: 00010002
  RAX: ffffffff81ef8800 RBX: ffffffff81e10480 RCX: 0000000000000003
  RDX: 0000000000000000 RSI: 00000000000f4240 RDI: 0000000000000000
  RBP: ffff880011e03cc0 R08: 0000000000010000 R09: 0000000000000000
  R10: 0000000000000020 R11: 0000003b9aca0000 R12: 000000000001c100
  R13: 0000000000000000 R14: ffffffff81e10480 R15: ffffffff81e03cd8
  FS:  0000000000000000(0000) GS:ffff880011e00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00000000000000b0 CR3: 0000000001e09000 CR4: 00000000000006b0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   <IRQ>
   account_system_time+0x45/0x60
   account_process_tick+0x5a/0x140
   update_process_times+0x22/0x60
   tick_periodic+0x2b/0x90
   tick_handle_periodic+0x25/0x70
   timer_interrupt+0x15/0x20
   __handle_irq_event_percpu+0x7e/0x1b0
   handle_irq_event_percpu+0x23/0x60
   handle_irq_event+0x42/0x70
   handle_level_irq+0x83/0x100
   handle_irq+0x6f/0x110
   do_IRQ+0x46/0xd0
   common_interrupt+0x9d/0x9d

Fix it by statically initializing init_css_set.dfl_cgrp so that init's
default cgroup is accessible from the get-go.

Fixes: 041cd640b2f3 ("cgroup: Implement cgroup2 basic CPU usage accounting")
Reported-by: “kbuild-all@01.org” <kbuild-all@01.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d036625556c9..7975b20f1fd1 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -649,6 +649,14 @@ struct css_set init_css_set = {
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+
+	/*
+	 * The following field is re-initialized when this cset gets linked
+	 * in cgroup_init().  However, let's initialize the field
+	 * statically too so that the default cgroup can be accessed safely
+	 * early during boot.
+	 */
+	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
-- 
cgit v1.3-14-g43fede


From 8157a7faf94135386bf04b1cf94e6efd3fb94702 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 14:27:54 -0700
Subject: sched/cputime: Add dummy cputime_adjust() implementation for
 CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cfb766da54d9 ("sched/cputime: Expose cputime_adjust()") made
cputime_adjust() public for cgroup basic cpu stat support; however,
the commit forgot to add a dummy implementaiton for
CONFIG_VIRT_CPU_ACCOUNTING_NATIVE leading to compiler errors on some
s390 configurations.

Fix it by adding the missing dummy implementation.

Reported-by: “kbuild-all@01.org” <kbuild-all@01.org>
Fixes: cfb766da54d9 ("sched/cputime: Expose cputime_adjust()")
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/cputime.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e01b699bbd5b..5498f20d2475 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -447,6 +447,13 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+		    u64 *ut, u64 *st)
+{
+	*ut = curr->utime;
+	*st = curr->stime;
+}
+
 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 {
 	*ut = p->utime;
-- 
cgit v1.3-14-g43fede


From b5d7388f9db78f19e6af7b56a469ca8d1860329d Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Thu, 21 Sep 2017 18:43:29 -0400
Subject: bpf: Optimize lpm trie delete

Before the delete operator was added, this datastructure maintained
an invariant that intermediate nodes were only present when necessary
to build the tree.  This patch updates the delete operation to reinstate
that invariant by removing unnecessary intermediate nodes after a node is
removed and thus keeping the tree structure at a minimal size.

Suggested-by: Daniel Mack <daniel@zonque.org>
Signed-off-by: Craig Gallek <kraig@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/lpm_trie.c | 71 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 43 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 9d58a576b2ae..34d8a690ea05 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -394,8 +394,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 {
 	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
 	struct bpf_lpm_trie_key *key = _key;
-	struct lpm_trie_node __rcu **trim;
-	struct lpm_trie_node *node;
+	struct lpm_trie_node __rcu **trim, **trim2;
+	struct lpm_trie_node *node, *parent;
 	unsigned long irq_flags;
 	unsigned int next_bit;
 	size_t matchlen = 0;
@@ -407,31 +407,26 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 	raw_spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Walk the tree looking for an exact key/length match and keeping
-	 * track of where we could begin trimming the tree.  The trim-point
-	 * is the sub-tree along the walk consisting of only single-child
-	 * intermediate nodes and ending at a leaf node that we want to
-	 * remove.
+	 * track of the path we traverse.  We will need to know the node
+	 * we wish to delete, and the slot that points to the node we want
+	 * to delete.  We may also need to know the nodes parent and the
+	 * slot that contains it.
 	 */
 	trim = &trie->root;
-	node = rcu_dereference_protected(
-		trie->root, lockdep_is_held(&trie->lock));
-	while (node) {
+	trim2 = trim;
+	parent = NULL;
+	while ((node = rcu_dereference_protected(
+		       *trim, lockdep_is_held(&trie->lock)))) {
 		matchlen = longest_prefix_match(trie, node, key);
 
 		if (node->prefixlen != matchlen ||
 		    node->prefixlen == key->prefixlen)
 			break;
 
+		parent = node;
+		trim2 = trim;
 		next_bit = extract_bit(key->data, node->prefixlen);
-		/* If we hit a node that has more than one child or is a valid
-		 * prefix itself, do not remove it. Reset the root of the trim
-		 * path to its descendant on our path.
-		 */
-		if (!(node->flags & LPM_TREE_NODE_FLAG_IM) ||
-		    (node->child[0] && node->child[1]))
-			trim = &node->child[next_bit];
-		node = rcu_dereference_protected(
-			node->child[next_bit], lockdep_is_held(&trie->lock));
+		trim = &node->child[next_bit];
 	}
 
 	if (!node || node->prefixlen != key->prefixlen ||
@@ -442,27 +437,47 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 
 	trie->n_entries--;
 
-	/* If the node we are removing is not a leaf node, simply mark it
+	/* If the node we are removing has two children, simply mark it
 	 * as intermediate and we are done.
 	 */
-	if (rcu_access_pointer(node->child[0]) ||
+	if (rcu_access_pointer(node->child[0]) &&
 	    rcu_access_pointer(node->child[1])) {
 		node->flags |= LPM_TREE_NODE_FLAG_IM;
 		goto out;
 	}
 
-	/* trim should now point to the slot holding the start of a path from
-	 * zero or more intermediate nodes to our leaf node for deletion.
+	/* If the parent of the node we are about to delete is an intermediate
+	 * node, and the deleted node doesn't have any children, we can delete
+	 * the intermediate parent as well and promote its other child
+	 * up the tree.  Doing this maintains the invariant that all
+	 * intermediate nodes have exactly 2 children and that there are no
+	 * unnecessary intermediate nodes in the tree.
 	 */
-	while ((node = rcu_dereference_protected(
-			*trim, lockdep_is_held(&trie->lock)))) {
-		RCU_INIT_POINTER(*trim, NULL);
-		trim = rcu_access_pointer(node->child[0]) ?
-			&node->child[0] :
-			&node->child[1];
+	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+	    !node->child[0] && !node->child[1]) {
+		if (node == rcu_access_pointer(parent->child[0]))
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[1]));
+		else
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[0]));
+		kfree_rcu(parent, rcu);
 		kfree_rcu(node, rcu);
+		goto out;
 	}
 
+	/* The node we are removing has either zero or one child. If there
+	 * is a child, move it into the removed node's slot then delete
+	 * the node.  Otherwise just clear the slot and delete the node.
+	 */
+	if (node->child[0])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+	else if (node->child[1])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+	else
+		RCU_INIT_POINTER(*trim, NULL);
+	kfree_rcu(node, rcu);
+
 out:
 	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
 
-- 
cgit v1.3-14-g43fede


From 05e3db95ebfc5c06a29a1d8c7a3e02f46f3a25a7 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 14 Sep 2017 14:02:04 -0700
Subject: kthread: add a mechanism to store cgroup info

kthread usually runs jobs on behalf of other threads. The jobs should be
charged to cgroup of original threads. But the jobs run in a kthread,
where we lose the cgroup context of original threads. The patch adds a
machanism to record cgroup info of original threads in kthread context.
Later we can retrieve the cgroup info and attach the cgroup info to jobs.

Since this mechanism is only required by kthread, we store the cgroup
info in kthread data instead of generic task_struct.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/kthread.h | 11 +++++++++
 kernel/kthread.c        | 66 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 82e197eeac91..bd4369c83dfb 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -3,6 +3,7 @@
 /* Simple interface for creating and stopping kernel threads without mess. */
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/cgroup.h>
 
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
@@ -198,4 +199,14 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
+#ifdef CONFIG_CGROUPS
+void kthread_associate_blkcg(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *kthread_blkcg(void);
+#else
+static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { }
+static inline struct cgroup_subsys_state *kthread_blkcg(void)
+{
+	return NULL;
+}
+#endif
 #endif /* _LINUX_KTHREAD_H */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c19edf82427..b011ea08967f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
 #include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
-#include <linux/cgroup.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
 	void *data;
 	struct completion parked;
 	struct completion exited;
+#ifdef CONFIG_CGROUPS
+	struct cgroup_subsys_state *blkcg_css;
+#endif
 };
 
 enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 
 void free_kthread_struct(struct task_struct *k)
 {
+	struct kthread *kthread;
+
 	/*
 	 * Can be NULL if this kthread was created by kernel_thread()
 	 * or if kmalloc() in kthread() failed.
 	 */
-	kfree(to_kthread(k));
+	kthread = to_kthread(k);
+#ifdef CONFIG_CGROUPS
+	WARN_ON_ONCE(kthread && kthread->blkcg_css);
+#endif
+	kfree(kthread);
 }
 
 /**
@@ -216,6 +224,9 @@ static int kthread(void *_create)
 	self->data = data;
 	init_completion(&self->exited);
 	init_completion(&self->parked);
+#ifdef CONFIG_CGROUPS
+	self->blkcg_css = NULL;
+#endif
 	current->vfork_done = &self->exited;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -1154,3 +1165,54 @@ void kthread_destroy_worker(struct kthread_worker *worker)
 	kfree(worker);
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_CGROUPS
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+	struct kthread *kthread;
+
+	if (!(current->flags & PF_KTHREAD))
+		return;
+	kthread = to_kthread(current);
+	if (!kthread)
+		return;
+
+	if (kthread->blkcg_css) {
+		css_put(kthread->blkcg_css);
+		kthread->blkcg_css = NULL;
+	}
+	if (css) {
+		css_get(css);
+		kthread->blkcg_css = css;
+	}
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+	struct kthread *kthread;
+
+	if (current->flags & PF_KTHREAD) {
+		kthread = to_kthread(current);
+		if (kthread)
+			return kthread->blkcg_css;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(kthread_blkcg);
+#endif
-- 
cgit v1.3-14-g43fede


From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 26 Sep 2017 11:02:12 -0700
Subject: block: fix a build error

The code is only for blkcg not for all cgroups

Fixes: d4478e92d618 ("block/loop: make loop cgroup aware")
Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c    | 2 +-
 include/linux/kthread.h | 2 +-
 kernel/kthread.c        | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index fd4eff5f5b76..bc8e61506968 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1692,7 +1692,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	/* always use the first bio's css */
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 	if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) {
 		cmd->css = cmd->rq->bio->bi_css;
 		css_get(cmd->css);
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index bd4369c83dfb..fb201842c635 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -199,7 +199,7 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work);
 
 void kthread_destroy_worker(struct kthread_worker *worker);
 
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 void kthread_associate_blkcg(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *kthread_blkcg(void);
 #else
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b011ea08967f..f87cd8b4eb2a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -46,7 +46,7 @@ struct kthread {
 	void *data;
 	struct completion parked;
 	struct completion exited;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 	struct cgroup_subsys_state *blkcg_css;
 #endif
 };
@@ -83,7 +83,7 @@ void free_kthread_struct(struct task_struct *k)
 	 * or if kmalloc() in kthread() failed.
 	 */
 	kthread = to_kthread(k);
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 	WARN_ON_ONCE(kthread && kthread->blkcg_css);
 #endif
 	kfree(kthread);
@@ -224,7 +224,7 @@ static int kthread(void *_create)
 	self->data = data;
 	init_completion(&self->exited);
 	init_completion(&self->parked);
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 	self->blkcg_css = NULL;
 #endif
 	current->vfork_done = &self->exited;
@@ -1166,7 +1166,7 @@ void kthread_destroy_worker(struct kthread_worker *worker)
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
 
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_BLK_CGROUP
 /**
  * kthread_associate_blkcg - associate blkcg to current kthread
  * @css: the cgroup info
-- 
cgit v1.3-14-g43fede


From 6aaae2b6c4330a46204bca042f1d2f41e8e18dea Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 25 Sep 2017 02:25:50 +0200
Subject: bpf: rename bpf_compute_data_end into bpf_compute_data_pointers

Just do the rename into bpf_compute_data_pointers() as we'll add
one more pointer here to recompute.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/filter.h |  9 ++++++---
 kernel/bpf/sockmap.c   |  4 ++--
 net/bpf/test_run.c     |  2 +-
 net/core/filter.c      | 14 +++++++-------
 net/core/lwt_bpf.c     |  2 +-
 net/sched/act_bpf.c    |  4 ++--
 net/sched/cls_bpf.c    |  4 ++--
 7 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d29e58fde364..052bab3d62e7 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -496,10 +496,13 @@ struct xdp_buff {
 	void *data_hard_start;
 };
 
-/* compute the linear packet data range [data, data_end) which
- * will be accessed by cls_bpf, act_bpf and lwt programs
+/* Compute the linear packet data range [data, data_end) which
+ * will be accessed by various program types (cls_bpf, act_bpf,
+ * lwt, ...). Subsystems allowing direct data access must (!)
+ * ensure that cb[] area can be written to when BPF program is
+ * invoked (otherwise cb[] save/restore is necessary).
  */
-static inline void bpf_compute_data_end(struct sk_buff *skb)
+static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 {
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..a298d6666698 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 
 	skb_orphan(skb);
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 
@@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 6be41a44d688..df672517b4fd 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -133,7 +133,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	if (is_l2)
 		__skb_push(skb, ETH_HLEN);
 	if (is_direct_pkt_access)
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 	retval = bpf_test_run(prog, skb, repeat, &duration);
 	if (!is_l2)
 		__skb_push(skb, ETH_HLEN);
diff --git a/net/core/filter.c b/net/core/filter.c
index 82edad58d066..c468e7cfad19 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1402,7 +1402,7 @@ static inline int bpf_try_make_writable(struct sk_buff *skb,
 {
 	int err = __bpf_try_make_writable(skb, write_len);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return err;
 }
 
@@ -1962,7 +1962,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
 	ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
 	bpf_pull_mac_rcsum(skb);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -1984,7 +1984,7 @@ BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
 	ret = skb_vlan_pop(skb);
 	bpf_pull_mac_rcsum(skb);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2178,7 +2178,7 @@ BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
 	 * need to be verified first.
 	 */
 	ret = bpf_skb_proto_xlat(skb, proto);
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2303,7 +2303,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
 	ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
 		       bpf_skb_net_grow(skb, len_diff_abs);
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2394,7 +2394,7 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
 			skb_gso_reset(skb);
 	}
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return ret;
 }
 
@@ -2434,7 +2434,7 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
 		skb_reset_mac_header(skb);
 	}
 
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	return 0;
 }
 
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 1307731ddfe4..e7e626fb87bb 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -51,7 +51,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	 */
 	preempt_disable();
 	rcu_read_lock();
-	bpf_compute_data_end(skb);
+	bpf_compute_data_pointers(skb);
 	ret = bpf_prog_run_save_cb(lwt->prog, skb);
 	rcu_read_unlock();
 
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index c0c707eb2c96..5ef8ce8c83d4 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -49,11 +49,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 	filter = rcu_dereference(prog->filter);
 	if (at_ingress) {
 		__skb_push(skb, skb->mac_len);
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 		__skb_pull(skb, skb->mac_len);
 	} else {
-		bpf_compute_data_end(skb);
+		bpf_compute_data_pointers(skb);
 		filter_res = BPF_PROG_RUN(filter, skb);
 	}
 	rcu_read_unlock();
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 520c5027646a..36671b0fb125 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -99,11 +99,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 		} else if (at_ingress) {
 			/* It is safe to push/pull even if skb_shared() */
 			__skb_push(skb, skb->mac_len);
-			bpf_compute_data_end(skb);
+			bpf_compute_data_pointers(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 			__skb_pull(skb, skb->mac_len);
 		} else {
-			bpf_compute_data_end(skb);
+			bpf_compute_data_pointers(skb);
 			filter_res = BPF_PROG_RUN(prog->filter, skb);
 		}
 
-- 
cgit v1.3-14-g43fede


From de8f3a83b0a0fddb2cf56e7a718127e9619ea3da Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Mon, 25 Sep 2017 02:25:51 +0200
Subject: bpf: add meta pointer for direct access

This work enables generic transfer of metadata from XDP into skb. The
basic idea is that we can make use of the fact that the resulting skb
must be linear and already comes with a larger headroom for supporting
bpf_xdp_adjust_head(), which mangles xdp->data. Here, we base our work
on a similar principle and introduce a small helper bpf_xdp_adjust_meta()
for adjusting a new pointer called xdp->data_meta. Thus, the packet has
a flexible and programmable room for meta data, followed by the actual
packet data. struct xdp_buff is therefore laid out that we first point
to data_hard_start, then data_meta directly prepended to data followed
by data_end marking the end of packet. bpf_xdp_adjust_head() takes into
account whether we have meta data already prepended and if so, memmove()s
this along with the given offset provided there's enough room.

xdp->data_meta is optional and programs are not required to use it. The
rationale is that when we process the packet in XDP (e.g. as DoS filter),
we can push further meta data along with it for the XDP_PASS case, and
give the guarantee that a clsact ingress BPF program on the same device
can pick this up for further post-processing. Since we work with skb
there, we can also set skb->mark, skb->priority or other skb meta data
out of BPF, thus having this scratch space generic and programmable
allows for more flexibility than defining a direct 1:1 transfer of
potentially new XDP members into skb (it's also more efficient as we
don't need to initialize/handle each of such new members). The facility
also works together with GRO aggregation. The scratch space at the head
of the packet can be multiple of 4 byte up to 32 byte large. Drivers not
yet supporting xdp->data_meta can simply be set up with xdp->data_meta
as xdp->data + 1 as bpf_xdp_adjust_meta() will detect this and bail out,
such that the subsequent match against xdp->data for later access is
guaranteed to fail.

The verifier treats xdp->data_meta/xdp->data the same way as we treat
xdp->data/xdp->data_end pointer comparisons. The requirement for doing
the compare against xdp->data is that it hasn't been modified from it's
original address we got from ctx access. It may have a range marking
already from prior successful xdp->data/xdp->data_end pointer comparisons
though.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c      |   1 +
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   |   1 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c        |   1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      |   1 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |   1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c    |   1 +
 .../net/ethernet/netronome/nfp/nfp_net_common.c    |   1 +
 drivers/net/ethernet/qlogic/qede/qede_fp.c         |   1 +
 drivers/net/tun.c                                  |   1 +
 drivers/net/virtio_net.c                           |   2 +
 include/linux/bpf.h                                |   1 +
 include/linux/filter.h                             |  21 +++-
 include/linux/skbuff.h                             |  68 +++++++++++-
 include/uapi/linux/bpf.h                           |  13 ++-
 kernel/bpf/verifier.c                              | 114 ++++++++++++++++-----
 net/bpf/test_run.c                                 |   1 +
 net/core/dev.c                                     |  31 +++++-
 net/core/filter.c                                  |  77 +++++++++++++-
 net/core/skbuff.c                                  |   2 +
 19 files changed, 297 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index d8f0c837b72c..06ce63c00821 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -94,6 +94,7 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 
 	xdp.data_hard_start = *data_ptr - offset;
 	xdp.data = *data_ptr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = *data_ptr + *len;
 	orig_data = xdp.data;
 	mapping = rx_buf->mapping - bp->rx_dma_offset;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 49b80da51ba7..d68478afccbf 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -523,6 +523,7 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 
 	xdp.data_hard_start = page_address(page);
 	xdp.data = (void *)cpu_addr;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + len;
 	orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 1519dfb851d0..f426762bd83a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2107,6 +2107,7 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      i40e_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index d962368d08d0..04bb03bda1cd 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2326,6 +2326,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		if (!skb) {
 			xdp.data = page_address(rx_buffer->page) +
 				   rx_buffer->page_offset;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_hard_start = xdp.data -
 					      ixgbe_rx_offset(rx_ring);
 			xdp.data_end = xdp.data + size;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b97a55c827eb..8f9cb8abc497 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -762,6 +762,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 			xdp.data_hard_start = va - frags[0].page_offset;
 			xdp.data = va;
+			xdp_set_data_meta_invalid(&xdp);
 			xdp.data_end = xdp.data + length;
 			orig_data = xdp.data;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f1dd638384d3..30b3f3fbd719 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -794,6 +794,7 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		return false;
 
 	xdp.data = va + *rx_headroom;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 	xdp.data_hard_start = va;
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1c0187f0af51..e3a38be3600a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1583,6 +1583,7 @@ static int nfp_net_run_xdp(struct bpf_prog *prog, void *data, void *hard_start,
 
 	xdp.data_hard_start = hard_start;
 	xdp.data = data + *off;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = data + *off + *len;
 
 	orig_data = xdp.data;
diff --git a/drivers/net/ethernet/qlogic/qede/qede_fp.c b/drivers/net/ethernet/qlogic/qede/qede_fp.c
index 6fc854b120b0..48ec4c56cddf 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_fp.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_fp.c
@@ -1004,6 +1004,7 @@ static bool qede_rx_xdp(struct qede_dev *edev,
 
 	xdp.data_hard_start = page_address(bd->data);
 	xdp.data = xdp.data_hard_start + *data_offset;
+	xdp_set_data_meta_invalid(&xdp);
 	xdp.data_end = xdp.data + *len;
 
 	/* Queues always have a full reset currently, so for the time
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2c36f6ebad79..a6e0bffe3d29 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1468,6 +1468,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 
 		xdp.data_hard_start = buf;
 		xdp.data = buf + pad;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index dd14a4547932..fc059f193e7d 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -554,6 +554,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 
 		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
 		xdp.data = xdp.data_hard_start + xdp_headroom;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + len;
 		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
@@ -686,6 +687,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		data = page_address(xdp_page) + offset;
 		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 		xdp.data = data + vi->hdr_len;
+		xdp_set_data_meta_invalid(&xdp);
 		xdp.data_end = xdp.data + (len - vi->hdr_len);
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8390859e79e7..2b672c50f160 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -137,6 +137,7 @@ enum bpf_reg_type {
 	PTR_TO_MAP_VALUE,	 /* reg points to map element value */
 	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
 	PTR_TO_STACK,		 /* reg == frame_pointer + offset */
+	PTR_TO_PACKET_META,	 /* skb->data - meta_len */
 	PTR_TO_PACKET,		 /* reg points to skb->data */
 	PTR_TO_PACKET_END,	 /* skb->data + headlen */
 };
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 052bab3d62e7..911d454af107 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -487,12 +487,14 @@ struct sk_filter {
 
 struct bpf_skb_data_end {
 	struct qdisc_skb_cb qdisc_cb;
+	void *data_meta;
 	void *data_end;
 };
 
 struct xdp_buff {
 	void *data;
 	void *data_end;
+	void *data_meta;
 	void *data_hard_start;
 };
 
@@ -507,7 +509,8 @@ static inline void bpf_compute_data_pointers(struct sk_buff *skb)
 	struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb;
 
 	BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb));
-	cb->data_end = skb->data + skb_headlen(skb);
+	cb->data_meta = skb->data - skb_metadata_len(skb);
+	cb->data_end  = skb->data + skb_headlen(skb);
 }
 
 static inline u8 *bpf_skb_cb(struct sk_buff *skb)
@@ -728,8 +731,22 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
+
 void bpf_warn_invalid_xdp_action(u32 act);
-void bpf_warn_invalid_xdp_redirect(u32 ifindex);
 
 struct sock *do_sk_redirect_map(void);
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f9db5539a6fb..19e64bfb1a66 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -489,8 +489,9 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
-	unsigned short	_unused;
-	unsigned char	nr_frags;
+	__u8		__unused;
+	__u8		meta_len;
+	__u8		nr_frags;
 	__u8		tx_flags;
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
@@ -3400,6 +3401,69 @@ static inline ktime_t net_invalid_timestamp(void)
 	return 0;
 }
 
+static inline u8 skb_metadata_len(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->meta_len;
+}
+
+static inline void *skb_metadata_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb);
+}
+
+static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
+					  const struct sk_buff *skb_b,
+					  u8 meta_len)
+{
+	const void *a = skb_metadata_end(skb_a);
+	const void *b = skb_metadata_end(skb_b);
+	/* Using more efficient varaiant than plain call to memcmp(). */
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+	u64 diffs = 0;
+
+	switch (meta_len) {
+#define __it(x, op) (x -= sizeof(u##op))
+#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
+	case 32: diffs |= __it_diff(a, b, 64);
+	case 24: diffs |= __it_diff(a, b, 64);
+	case 16: diffs |= __it_diff(a, b, 64);
+	case  8: diffs |= __it_diff(a, b, 64);
+		break;
+	case 28: diffs |= __it_diff(a, b, 64);
+	case 20: diffs |= __it_diff(a, b, 64);
+	case 12: diffs |= __it_diff(a, b, 64);
+	case  4: diffs |= __it_diff(a, b, 32);
+		break;
+	}
+	return diffs;
+#else
+	return memcmp(a - meta_len, b - meta_len, meta_len);
+#endif
+}
+
+static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
+					const struct sk_buff *skb_b)
+{
+	u8 len_a = skb_metadata_len(skb_a);
+	u8 len_b = skb_metadata_len(skb_b);
+
+	if (!(len_a | len_b))
+		return false;
+
+	return len_a != len_b ?
+	       true : __skb_metadata_differs(skb_a, skb_b, len_a);
+}
+
+static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
+{
+	skb_shinfo(skb)->meta_len = meta_len;
+}
+
+static inline void skb_metadata_clear(struct sk_buff *skb)
+{
+	skb_metadata_set(skb, 0);
+}
+
 struct sk_buff *skb_clone_sk(struct sk_buff *skb);
 
 #ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ab5c402f98..e43491ac4823 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -582,6 +582,12 @@ union bpf_attr {
  *	@map: pointer to sockmap to update
  *	@key: key to insert/update sock in map
  *	@flags: same flags as map update elem
+ *
+ * int bpf_xdp_adjust_meta(xdp_md, delta)
+ *     Adjust the xdp_md.data_meta by delta
+ *     @xdp_md: pointer to xdp_md
+ *     @delta: An positive/negative integer to be added to xdp_md.data_meta
+ *     Return: 0 on success or negative on error
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -638,6 +644,7 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
+	FN(xdp_adjust_meta),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -715,7 +722,7 @@ struct __sk_buff {
 	__u32 data_end;
 	__u32 napi_id;
 
-	/* accessed by BPF_PROG_TYPE_sk_skb types */
+	/* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
 	__u32 family;
 	__u32 remote_ip4;	/* Stored in network byte order */
 	__u32 local_ip4;	/* Stored in network byte order */
@@ -723,6 +730,9 @@ struct __sk_buff {
 	__u32 local_ip6[4];	/* Stored in network byte order */
 	__u32 remote_port;	/* Stored in network byte order */
 	__u32 local_port;	/* stored in host byte order */
+	/* ... here. */
+
+	__u32 data_meta;
 };
 
 struct bpf_tunnel_key {
@@ -783,6 +793,7 @@ enum xdp_action {
 struct xdp_md {
 	__u32 data;
 	__u32 data_end;
+	__u32 data_meta;
 };
 
 enum sk_action {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b914fbe1383e..f849eca36052 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
 	va_end(args);
 }
 
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
+}
+
 /* string representation of 'enum bpf_reg_type' */
 static const char * const reg_type_str[] = {
 	[NOT_INIT]		= "?",
@@ -187,6 +193,7 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
 	[PTR_TO_STACK]		= "fp",
 	[PTR_TO_PACKET]		= "pkt",
+	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
@@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 			verbose("(id=%d", reg->id);
 			if (t != SCALAR_VALUE)
 				verbose(",off=%d", reg->off);
-			if (t == PTR_TO_PACKET)
+			if (type_is_pkt_pointer(t))
 				verbose(",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
 				 t == PTR_TO_MAP_VALUE ||
@@ -519,6 +526,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_known_zero(regs + regno);
 }
 
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+	return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+	return reg_is_pkt_pointer(reg) ||
+	       reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+				    enum bpf_reg_type which)
+{
+	/* The register can already have a range from prior markings.
+	 * This is fine as long as it hasn't been advanced from its
+	 * origin.
+	 */
+	return reg->type == which &&
+	       reg->id == 0 &&
+	       reg->off == 0 &&
+	       tnum_equals_const(reg->var_off, 0);
+}
+
 /* Attempts to improve min/max values based on var_off information */
 static void __update_reg_bounds(struct bpf_reg_state *reg)
 {
@@ -702,6 +734,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
 	case CONST_PTR_TO_MAP:
 		return true;
@@ -1047,7 +1080,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
-		/* special case, because of NET_IP_ALIGN */
+	case PTR_TO_PACKET_META:
+		/* Special case, because of NET_IP_ALIGN. Given metadata sits
+		 * right in front, treat it the very same way.
+		 */
 		return check_pkt_ptr_alignment(reg, off, size, strict);
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
@@ -1124,8 +1160,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
-			 * PTR_TO_PACKET[_END].  In the latter case, we know
-			 * the offset is zero.
+			 * PTR_TO_PACKET[_META,_END]. In the latter
+			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
 				mark_reg_unknown(state->regs, value_regno);
@@ -1170,7 +1206,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		} else {
 			err = check_stack_read(state, off, size, value_regno);
 		}
-	} else if (reg->type == PTR_TO_PACKET) {
+	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose("cannot write into packet\n");
 			return -EACCES;
@@ -1310,6 +1346,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 		return check_packet_access(env, regno, reg->off, access_size);
 	case PTR_TO_MAP_VALUE:
 		return check_map_access(env, regno, reg->off, access_size);
@@ -1342,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET &&
+	if (type_is_pkt_pointer(type) &&
 	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
 		verbose("helper access to the packet is not allowed\n");
 		return -EACCES;
@@ -1351,7 +1388,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-		if (type != PTR_TO_PACKET && type != expected_type)
+		if (!type_is_pkt_pointer(type) &&
+		    type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_SIZE ||
 		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (register_is_null(*reg))
 			/* final test in check_stack_boundary() */;
-		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+		else if (!type_is_pkt_pointer(type) &&
+			 type != PTR_TO_MAP_VALUE &&
 			 type != expected_type)
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -1401,7 +1440,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->key_size);
 		else
@@ -1417,7 +1456,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose("invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
 						  meta->map_ptr->value_size);
 		else
@@ -1590,8 +1629,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	return count > 1 ? -EINVAL : 0;
 }
 
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
@@ -1600,18 +1639,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET ||
-		    regs[i].type == PTR_TO_PACKET_END)
+		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(regs, i);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type != PTR_TO_PACKET &&
-		    reg->type != PTR_TO_PACKET_END)
-			continue;
-		__mark_reg_unknown(reg);
+		if (reg_is_pkt_pointer_any(reg))
+			__mark_reg_unknown(reg);
 	}
 }
 
@@ -1871,7 +1907,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			dst_reg->range = 0;
@@ -1931,7 +1967,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			if (smin_val < 0)
@@ -2421,7 +2457,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 }
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
-				   struct bpf_reg_state *dst_reg)
+				   struct bpf_reg_state *dst_reg,
+				   enum bpf_reg_type type)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
@@ -2483,7 +2520,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+		if (regs[i].type == type && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
 			regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
 
@@ -2491,7 +2528,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 		if (state->stack_slot_type[i] != STACK_SPILL)
 			continue;
 		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+		if (reg->type == type && reg->id == dst_reg->id)
 			reg->range = max_t(u16, reg->range, dst_reg->off);
 	}
 }
@@ -2856,19 +2893,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(this_branch, dst_reg);
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
 		   dst_reg->type == PTR_TO_PACKET &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		find_good_pkt_pointers(other_branch, dst_reg);
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
 	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
 		   dst_reg->type == PTR_TO_PACKET_END &&
 		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+		   dst_reg->type == PTR_TO_PACKET_META &&
+		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
+	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+				       PTR_TO_PACKET_META);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
@@ -3298,8 +3355,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			return false;
 		/* Check our ids match any regs they're supposed to */
 		return check_ids(rold->id, rcur->id, idmap);
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET:
-		if (rcur->type != PTR_TO_PACKET)
+		if (rcur->type != rold->type)
 			return false;
 		/* We must have at least as much range as the old ptr
 		 * did, so that any accesses which were safe before are
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index df672517b4fd..a86e6687026e 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -162,6 +162,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 
 	xdp.data_hard_start = data;
 	xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + size;
 
 	retval = bpf_test_run(prog, &xdp, repeat, &duration);
diff --git a/net/core/dev.c b/net/core/dev.c
index 97abddd9039a..e350c768d4b5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3864,8 +3864,8 @@ drop:
 static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
+	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
-	u32 act = XDP_DROP;
 	void *orig_data;
 	int hlen, off;
 	u32 mac_len;
@@ -3876,8 +3876,25 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	if (skb_cloned(skb))
 		return XDP_PASS;
 
-	if (skb_linearize(skb))
-		goto do_drop;
+	/* XDP packets must be linear and must have sufficient headroom
+	 * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
+	 * native XDP provides, thus we need to do it here as well.
+	 */
+	if (skb_is_nonlinear(skb) ||
+	    skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+		int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+		int troom = skb->tail + skb->data_len - skb->end;
+
+		/* In case we have to go down the path and also linearize,
+		 * then lets do the pskb_expand_head() work just once here.
+		 */
+		if (pskb_expand_head(skb,
+				     hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+				     troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
+			goto do_drop;
+		if (troom > 0 && __skb_linearize(skb))
+			goto do_drop;
+	}
 
 	/* The XDP program wants to see the packet starting at the MAC
 	 * header.
@@ -3885,6 +3902,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	mac_len = skb->data - skb_mac_header(skb);
 	hlen = skb_headlen(skb) + mac_len;
 	xdp.data = skb->data - mac_len;
+	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + hlen;
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
 	orig_data = xdp.data;
@@ -3902,10 +3920,12 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	case XDP_REDIRECT:
 	case XDP_TX:
 		__skb_push(skb, mac_len);
-		/* fall through */
+		break;
 	case XDP_PASS:
+		metalen = xdp.data - xdp.data_meta;
+		if (metalen)
+			skb_metadata_set(skb, metalen);
 		break;
-
 	default:
 		bpf_warn_invalid_xdp_action(act);
 		/* fall through */
@@ -4695,6 +4715,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
 		diffs |= p->vlan_tci ^ skb->vlan_tci;
 		diffs |= skb_metadata_dst_cmp(p, skb);
+		diffs |= skb_metadata_differs(p, skb);
 		if (maclen == ETH_HLEN)
 			diffs |= compare_ether_header(skb_mac_header(p),
 						      skb_mac_header(skb));
diff --git a/net/core/filter.c b/net/core/filter.c
index c468e7cfad19..9b6e7e84aafd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2447,14 +2447,26 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
+{
+	return xdp_data_meta_unsupported(xdp) ? 0 :
+	       xdp->data - xdp->data_meta;
+}
+
 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	unsigned long metalen = xdp_get_metalen(xdp);
+	void *data_start = xdp->data_hard_start + metalen;
 	void *data = xdp->data + offset;
 
-	if (unlikely(data < xdp->data_hard_start ||
+	if (unlikely(data < data_start ||
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
+	if (metalen)
+		memmove(xdp->data_meta + offset,
+			xdp->data_meta, metalen);
+	xdp->data_meta += offset;
 	xdp->data = data;
 
 	return 0;
@@ -2468,6 +2480,33 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
+{
+	void *meta = xdp->data_meta + offset;
+	unsigned long metalen = xdp->data - meta;
+
+	if (xdp_data_meta_unsupported(xdp))
+		return -ENOTSUPP;
+	if (unlikely(meta < xdp->data_hard_start ||
+		     meta > xdp->data))
+		return -EINVAL;
+	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
+		     (metalen > 32)))
+		return -EACCES;
+
+	xdp->data_meta = meta;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
+	.func		= bpf_xdp_adjust_meta,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 static int __bpf_tx_xdp(struct net_device *dev,
 			struct bpf_map *map,
 			struct xdp_buff *xdp,
@@ -2692,7 +2731,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_clone_redirect ||
 	    func == bpf_l3_csum_replace ||
 	    func == bpf_l4_csum_replace ||
-	    func == bpf_xdp_adjust_head)
+	    func == bpf_xdp_adjust_head ||
+	    func == bpf_xdp_adjust_meta)
 		return true;
 
 	return false;
@@ -3288,6 +3328,8 @@ xdp_func_proto(enum bpf_func_id func_id)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_xdp_adjust_head:
 		return &bpf_xdp_adjust_head_proto;
+	case BPF_FUNC_xdp_adjust_meta:
+		return &bpf_xdp_adjust_meta_proto;
 	case BPF_FUNC_redirect:
 		return &bpf_xdp_redirect_proto;
 	case BPF_FUNC_redirect_map:
@@ -3418,6 +3460,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
 	case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
 	case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		if (size != size_default)
 			return false;
@@ -3444,6 +3487,7 @@ static bool sk_filter_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range(struct __sk_buff, data):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 	case bpf_ctx_range(struct __sk_buff, data_end):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
 		return false;
@@ -3468,6 +3512,7 @@ static bool lwt_is_valid_access(int off, int size,
 	switch (off) {
 	case bpf_ctx_range(struct __sk_buff, tc_classid):
 	case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
 		return false;
 	}
 
@@ -3586,6 +3631,9 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case bpf_ctx_range(struct __sk_buff, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3619,6 +3667,9 @@ static bool xdp_is_valid_access(int off, int size,
 	case offsetof(struct xdp_md, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		info->reg_type = PTR_TO_PACKET_META;
+		break;
 	case offsetof(struct xdp_md, data_end):
 		info->reg_type = PTR_TO_PACKET_END;
 		break;
@@ -3677,6 +3728,12 @@ static bool sk_skb_is_valid_access(int off, int size,
 				   enum bpf_access_type type,
 				   struct bpf_insn_access_aux *info)
 {
+	switch (off) {
+	case bpf_ctx_range(struct __sk_buff, tc_classid):
+	case bpf_ctx_range(struct __sk_buff, data_meta):
+		return false;
+	}
+
 	if (type == BPF_WRITE) {
 		switch (off) {
 		case bpf_ctx_range(struct __sk_buff, mark):
@@ -3689,8 +3746,6 @@ static bool sk_skb_is_valid_access(int off, int size,
 	}
 
 	switch (off) {
-	case bpf_ctx_range(struct __sk_buff, tc_classid):
-		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 		info->reg_type = PTR_TO_PACKET;
 		break;
@@ -3847,6 +3902,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct sk_buff, data));
 		break;
 
+	case offsetof(struct __sk_buff, data_meta):
+		off  = si->off;
+		off -= offsetof(struct __sk_buff, data_meta);
+		off += offsetof(struct sk_buff, cb);
+		off += offsetof(struct bpf_skb_data_end, data_meta);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
+				      si->src_reg, off);
+		break;
+
 	case offsetof(struct __sk_buff, data_end):
 		off  = si->off;
 		off -= offsetof(struct __sk_buff, data_end);
@@ -4095,6 +4159,11 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct xdp_buff, data));
 		break;
+	case offsetof(struct xdp_md, data_meta):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct xdp_buff, data_meta));
+		break;
 	case offsetof(struct xdp_md, data_end):
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
 				      si->dst_reg, si->src_reg,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 000ce735fa8d..d98c2e3ce2bf 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1509,6 +1509,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
 
+	skb_metadata_clear(skb);
+
 	/* It is not generally safe to change skb->truesize.
 	 * For the moment, we really care of rx path, or
 	 * when skb is orphaned (not attached to a socket).
-- 
cgit v1.3-14-g43fede


From 63fef14fc98a8b4fad777fd3bef4d068802b3f14 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 18 Aug 2017 17:24:00 +0900
Subject: kprobes/x86: Make insn buffer always ROX and use text_poke()

Make insn buffer always ROX and use text_poke() to write
the copied instructions instead of set_memory_*().
This makes instruction buffer stronger against other
kernel subsystems because there is no window time
to modify the buffer.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/150304463032.17009.14195368040691676813.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/common.h |  6 ++--
 arch/x86/kernel/kprobes/core.c   | 61 ++++++++++++++++++++++++-------------
 arch/x86/kernel/kprobes/opt.c    | 65 +++++++++++++++++++++++-----------------
 kernel/kprobes.c                 |  2 +-
 4 files changed, 81 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/kprobes/common.h b/arch/x86/kernel/kprobes/common.h
index db2182d63ed0..e2c2a1970869 100644
--- a/arch/x86/kernel/kprobes/common.h
+++ b/arch/x86/kernel/kprobes/common.h
@@ -75,11 +75,11 @@ extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
  * Copy an instruction and adjust the displacement if the instruction
  * uses the %rip-relative addressing mode.
  */
-extern int __copy_instruction(u8 *dest, u8 *src, struct insn *insn);
+extern int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn);
 
 /* Generate a relative-jump/call instruction */
-extern void synthesize_reljump(void *from, void *to);
-extern void synthesize_relcall(void *from, void *to);
+extern void synthesize_reljump(void *dest, void *from, void *to);
+extern void synthesize_relcall(void *dest, void *from, void *to);
 
 #ifdef	CONFIG_OPTPROBES
 extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index f0153714ddac..b48e0efd668e 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -119,29 +119,29 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 
 static nokprobe_inline void
-__synthesize_relative_insn(void *from, void *to, u8 op)
+__synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
 {
 	struct __arch_relative_insn {
 		u8 op;
 		s32 raddr;
 	} __packed *insn;
 
-	insn = (struct __arch_relative_insn *)from;
+	insn = (struct __arch_relative_insn *)dest;
 	insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
 	insn->op = op;
 }
 
 /* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
-void synthesize_reljump(void *from, void *to)
+void synthesize_reljump(void *dest, void *from, void *to)
 {
-	__synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
+	__synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE);
 }
 NOKPROBE_SYMBOL(synthesize_reljump);
 
 /* Insert a call instruction at address 'from', which calls address 'to'.*/
-void synthesize_relcall(void *from, void *to)
+void synthesize_relcall(void *dest, void *from, void *to)
 {
-	__synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
+	__synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE);
 }
 NOKPROBE_SYMBOL(synthesize_relcall);
 
@@ -346,10 +346,11 @@ static int is_IF_modifier(kprobe_opcode_t *insn)
 /*
  * Copy an instruction with recovering modified instruction by kprobes
  * and adjust the displacement if the instruction uses the %rip-relative
- * addressing mode.
+ * addressing mode. Note that since @real will be the final place of copied
+ * instruction, displacement must be adjust by @real, not @dest.
  * This returns the length of copied instruction, or 0 if it has an error.
  */
-int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
+int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
 {
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
 	unsigned long recovered_insn =
@@ -387,11 +388,11 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
 		 * have given.
 		 */
 		newdisp = (u8 *) src + (s64) insn->displacement.value
-			  - (u8 *) dest;
+			  - (u8 *) real;
 		if ((s64) (s32) newdisp != newdisp) {
 			pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
 			pr_err("\tSrc: %p, Dest: %p, old disp: %x\n",
-				src, dest, insn->displacement.value);
+				src, real, insn->displacement.value);
 			return 0;
 		}
 		disp = (u8 *) dest + insn_offset_displacement(insn);
@@ -402,20 +403,38 @@ int __copy_instruction(u8 *dest, u8 *src, struct insn *insn)
 }
 
 /* Prepare reljump right after instruction to boost */
-static void prepare_boost(struct kprobe *p, struct insn *insn)
+static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
+			  struct insn *insn)
 {
+	int len = insn->length;
+
 	if (can_boost(insn, p->addr) &&
-	    MAX_INSN_SIZE - insn->length >= RELATIVEJUMP_SIZE) {
+	    MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) {
 		/*
 		 * These instructions can be executed directly if it
 		 * jumps back to correct address.
 		 */
-		synthesize_reljump(p->ainsn.insn + insn->length,
+		synthesize_reljump(buf + len, p->ainsn.insn + len,
 				   p->addr + insn->length);
+		len += RELATIVEJUMP_SIZE;
 		p->ainsn.boostable = true;
 	} else {
 		p->ainsn.boostable = false;
 	}
+
+	return len;
+}
+
+/* Make page to RO mode when allocate it */
+void *alloc_insn_page(void)
+{
+	void *page;
+
+	page = module_alloc(PAGE_SIZE);
+	if (page)
+		set_memory_ro((unsigned long)page & PAGE_MASK, 1);
+
+	return page;
 }
 
 /* Recover page to RW mode before releasing it */
@@ -429,12 +448,11 @@ void free_insn_page(void *page)
 static int arch_copy_kprobe(struct kprobe *p)
 {
 	struct insn insn;
+	kprobe_opcode_t buf[MAX_INSN_SIZE];
 	int len;
 
-	set_memory_rw((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
-
 	/* Copy an instruction with recovering if other optprobe modifies it.*/
-	len = __copy_instruction(p->ainsn.insn, p->addr, &insn);
+	len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
 	if (!len)
 		return -EINVAL;
 
@@ -442,15 +460,16 @@ static int arch_copy_kprobe(struct kprobe *p)
 	 * __copy_instruction can modify the displacement of the instruction,
 	 * but it doesn't affect boostable check.
 	 */
-	prepare_boost(p, &insn);
-
-	set_memory_ro((unsigned long)p->ainsn.insn & PAGE_MASK, 1);
+	len = prepare_boost(buf, p, &insn);
 
 	/* Check whether the instruction modifies Interrupt Flag or not */
-	p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn);
+	p->ainsn.if_modifier = is_IF_modifier(buf);
 
 	/* Also, displacement change doesn't affect the first byte */
-	p->opcode = p->ainsn.insn[0];
+	p->opcode = buf[0];
+
+	/* OK, write back the instruction(s) into ROX insn buffer */
+	text_poke(p->ainsn.insn, buf, len);
 
 	return 0;
 }
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 4f98aad38237..22e65f0b8b34 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -184,13 +184,13 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(optimized_callback);
 
-static int copy_optimized_instructions(u8 *dest, u8 *src)
+static int copy_optimized_instructions(u8 *dest, u8 *src, u8 *real)
 {
 	struct insn insn;
 	int len = 0, ret;
 
 	while (len < RELATIVEJUMP_SIZE) {
-		ret = __copy_instruction(dest + len, src + len, &insn);
+		ret = __copy_instruction(dest + len, src + len, real, &insn);
 		if (!ret || !can_boost(&insn, src + len))
 			return -EINVAL;
 		len += ret;
@@ -343,57 +343,66 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
 int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
 				  struct kprobe *__unused)
 {
-	u8 *buf;
-	int ret;
+	u8 *buf = NULL, *slot;
+	int ret, len;
 	long rel;
 
 	if (!can_optimize((unsigned long)op->kp.addr))
 		return -EILSEQ;
 
-	op->optinsn.insn = get_optinsn_slot();
-	if (!op->optinsn.insn)
+	buf = kzalloc(MAX_OPTINSN_SIZE, GFP_KERNEL);
+	if (!buf)
 		return -ENOMEM;
 
+	op->optinsn.insn = slot = get_optinsn_slot();
+	if (!slot) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	/*
 	 * Verify if the address gap is in 2GB range, because this uses
 	 * a relative jump.
 	 */
-	rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
+	rel = (long)slot - (long)op->kp.addr + RELATIVEJUMP_SIZE;
 	if (abs(rel) > 0x7fffffff) {
-		__arch_remove_optimized_kprobe(op, 0);
-		return -ERANGE;
+		ret = -ERANGE;
+		goto err;
 	}
 
-	buf = (u8 *)op->optinsn.insn;
-	set_memory_rw((unsigned long)buf & PAGE_MASK, 1);
+	/* Copy arch-dep-instance from template */
+	memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
 
 	/* Copy instructions into the out-of-line buffer */
-	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
-	if (ret < 0) {
-		__arch_remove_optimized_kprobe(op, 0);
-		return ret;
-	}
+	ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr,
+					  slot + TMPL_END_IDX);
+	if (ret < 0)
+		goto err;
 	op->optinsn.size = ret;
-
-	/* Copy arch-dep-instance from template */
-	memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
+	len = TMPL_END_IDX + op->optinsn.size;
 
 	/* Set probe information */
 	synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
 
 	/* Set probe function call */
-	synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
+	synthesize_relcall(buf + TMPL_CALL_IDX,
+			   slot + TMPL_CALL_IDX, optimized_callback);
 
 	/* Set returning jmp instruction at the tail of out-of-line buffer */
-	synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
+	synthesize_reljump(buf + len, slot + len,
 			   (u8 *)op->kp.addr + op->optinsn.size);
-
-	set_memory_ro((unsigned long)buf & PAGE_MASK, 1);
-
-	flush_icache_range((unsigned long) buf,
-			   (unsigned long) buf + TMPL_END_IDX +
-			   op->optinsn.size + RELATIVEJUMP_SIZE);
-	return 0;
+	len += RELATIVEJUMP_SIZE;
+
+	/* We have to use text_poke for instuction buffer because it is RO */
+	text_poke(slot, buf, len);
+	ret = 0;
+out:
+	kfree(buf);
+	return ret;
+
+err:
+	__arch_remove_optimized_kprobe(op, 0);
+	goto out;
 }
 
 /*
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1606a4224e1..15fba7fe57c8 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
 	SLOT_USED = 2,
 };
 
-static void *alloc_insn_page(void)
+void __weak *alloc_insn_page(void)
 {
 	return module_alloc(PAGE_SIZE);
 }
-- 
cgit v1.3-14-g43fede


From 3539d09154e11336c31a900a9cd49e386ba6d9b2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 19 Sep 2017 18:59:00 +0900
Subject: kprobes: Improve smoke test to check preemptibility

Add preemptible check to each handler. Handlers are called with
non-preemtible, which is guaranteed by Documentation/kprobes.txt.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/150581513991.32348.7956810394499654272.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/test_kprobes.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 0dbab6d1acb4..47106a1e645a 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -34,6 +34,10 @@ static noinline u32 kprobe_target(u32 value)
 
 static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
+	if (preemptible()) {
+		handler_errors++;
+		pr_err("pre-handler is preemptible\n");
+	}
 	preh_val = (rand1 / div_factor);
 	return 0;
 }
@@ -41,6 +45,10 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
 		unsigned long flags)
 {
+	if (preemptible()) {
+		handler_errors++;
+		pr_err("post-handler is preemptible\n");
+	}
 	if (preh_val != (rand1 / div_factor)) {
 		handler_errors++;
 		pr_err("incorrect value in post_handler\n");
@@ -156,6 +164,10 @@ static int test_kprobes(void)
 
 static u32 j_kprobe_target(u32 value)
 {
+	if (preemptible()) {
+		handler_errors++;
+		pr_err("jprobe-handler is preemptible\n");
+	}
 	if (value != rand1) {
 		handler_errors++;
 		pr_err("incorrect value in jprobe handler\n");
@@ -232,6 +244,10 @@ static u32 krph_val;
 
 static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
+	if (preemptible()) {
+		handler_errors++;
+		pr_err("kretprobe entry handler is preemptible\n");
+	}
 	krph_val = (rand1 / div_factor);
 	return 0;
 }
@@ -240,6 +256,10 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
 {
 	unsigned long ret = regs_return_value(regs);
 
+	if (preemptible()) {
+		handler_errors++;
+		pr_err("kretprobe return handler is preemptible\n");
+	}
 	if (ret != (rand1 / div_factor)) {
 		handler_errors++;
 		pr_err("incorrect value in kretprobe handler\n");
-- 
cgit v1.3-14-g43fede


From e863d5396146411b615231cae0c518cb2a23371c Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 19 Sep 2017 19:00:19 +0900
Subject: kprobes: Warn if optprobe handler tries to change execution path

Warn if optprobe handler tries to change execution path.
As described in Documentation/kprobes.txt, with optprobe
user handler can not change instruction pointer. In that
case user must avoid optimizing the kprobes by setting
post_handler or break_handler.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexei Starovoitov <ast@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/150581521955.32348.3615624715034787365.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kprobes.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 15fba7fe57c8..2d28377a0e32 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -387,7 +387,10 @@ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	list_for_each_entry_rcu(kp, &p->list, list) {
 		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
 			set_kprobe_instance(kp);
-			kp->pre_handler(kp, regs);
+			if (kp->pre_handler(kp, regs)) {
+				if (WARN_ON_ONCE(1))
+					pr_err("Optprobe ignores instruction pointer changing.(%pF)\n", p->addr);
+			}
 		}
 		reset_kprobe_instance();
 	}
-- 
cgit v1.3-14-g43fede


From 2b7c6ba945fd3c10ca3e030be402098aff2f89d3 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Tue, 26 Sep 2017 16:35:13 +0100
Subject: bpf/verifier: improve disassembly of BPF_END instructions

print_bpf_insn() was treating all BPF_ALU[64] the same, but BPF_END has a
 different structure: it has a size in insn->imm (even if it's BPF_X) and
 uses the BPF_SRC (X or K) to indicate which endianness to use.  So it
 needs different code to print it.

Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f849eca36052..e8d7bb8e6b98 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -332,26 +332,40 @@ static const char *const bpf_jmp_string[16] = {
 	[BPF_EXIT >> 4] = "exit",
 };
 
+static void print_bpf_end_insn(const struct bpf_verifier_env *env,
+			       const struct bpf_insn *insn)
+{
+	verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+		insn->imm, insn->dst_reg);
+}
+
 static void print_bpf_insn(const struct bpf_verifier_env *env,
 			   const struct bpf_insn *insn)
 {
 	u8 class = BPF_CLASS(insn->code);
 
 	if (class == BPF_ALU || class == BPF_ALU64) {
-		if (BPF_SRC(insn->code) == BPF_X)
+		if (BPF_OP(insn->code) == BPF_END) {
+			if (class == BPF_ALU64)
+				verbose("BUG_alu64_%02x\n", insn->code);
+			else
+				print_bpf_end_insn(env, insn);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
 			verbose("(%02x) %sr%d %s %sr%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->src_reg);
-		else
+		} else {
 			verbose("(%02x) %sr%d %s %s%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->imm);
+		}
 	} else if (class == BPF_STX) {
 		if (BPF_MODE(insn->code) == BPF_MEM)
 			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
-- 
cgit v1.3-14-g43fede


From 73c864b38383f4abc9b559025cd663f4a81afa89 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Tue, 26 Sep 2017 16:35:29 +0100
Subject: bpf/verifier: improve disassembly of BPF_NEG instructions

BPF_NEG takes only one operand, unlike the bulk of BPF_ALU[64] which are
 compound-assignments.  So give it its own format in print_bpf_insn().

Signed-off-by: Edward Cree <ecree@solarflare.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e8d7bb8e6b98..4cf9b72c59a0 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -351,6 +351,11 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
 				verbose("BUG_alu64_%02x\n", insn->code);
 			else
 				print_bpf_end_insn(env, insn);
+		} else if (BPF_OP(insn->code) == BPF_NEG) {
+			verbose("(%02x) r%d = %s-r%d\n",
+				insn->code, insn->dst_reg,
+				class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
 			verbose("(%02x) %sr%d %s %sr%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
-- 
cgit v1.3-14-g43fede


From cb4d2b3f03d8eed90be3a194e5b54b734ec4bbe9 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:52 -0700
Subject: bpf: Add name, load_time, uid and map_ids to bpf_prog_info

The patch adds name and load_time to struct bpf_prog_aux.  They
are also exported to bpf_prog_info.

The bpf_prog's name is passed by userspace during BPF_PROG_LOAD.
The kernel only stores the first (BPF_PROG_NAME_LEN - 1) bytes
and the name stored in the kernel is always \0 terminated.

The kernel will reject name that contains characters other than
isalnum() and '_'.  It will also reject name that is not null
terminated.

The existing 'user->uid' of the bpf_prog_aux is also exported to
the bpf_prog_info as created_by_uid.

The existing 'used_maps' of the bpf_prog_aux is exported to
the newly added members 'nr_map_ids' and 'map_ids' of
the bpf_prog_info.  On the input, nr_map_ids tells how
big the userspace's map_ids buffer is.  On the output,
nr_map_ids tells the exact user_map_cnt and it will only
copy up to the userspace's map_ids buffer is allowed.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  2 ++
 include/uapi/linux/bpf.h |  8 ++++++++
 kernel/bpf/syscall.c     | 51 +++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 60 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2b672c50f160..33ccc474fb04 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -187,6 +187,8 @@ struct bpf_prog_aux {
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
+	u64 load_time; /* ns since boottime */
+	u8 name[BPF_OBJ_NAME_LEN];
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e43491ac4823..bd6348269bf5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -175,6 +175,8 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+#define BPF_OBJ_NAME_LEN 16U
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -210,6 +212,7 @@ union bpf_attr {
 		__aligned_u64	log_buf;	/* user supplied buffer */
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
+		__u8		prog_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
@@ -812,6 +815,11 @@ struct bpf_prog_info {
 	__u32 xlated_prog_len;
 	__aligned_u64 jited_prog_insns;
 	__aligned_u64 xlated_prog_insns;
+	__u64 load_time;	/* ns since boottime */
+	__u32 created_by_uid;
+	__u32 nr_map_ids;
+	__aligned_u64 map_ids;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25d074920a00..45970df3f820 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
 #include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -312,6 +315,30 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+	const char *end = src + BPF_OBJ_NAME_LEN;
+
+	/* Copy all isalnum() and '_' char */
+	while (src < end && *src) {
+		if (!isalnum(*src) && *src != '_')
+			return -EINVAL;
+		*dst++ = *src++;
+	}
+
+	/* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+	if (src == end)
+		return -EINVAL;
+
+	/* '\0' terminates dst */
+	*dst = 0;
+
+	return 0;
+}
+
 #define BPF_MAP_CREATE_LAST_FIELD numa_node
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
@@ -973,7 +1000,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define	BPF_PROG_LOAD_LAST_FIELD prog_name
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1037,6 +1064,11 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err < 0)
 		goto free_prog;
 
+	prog->aux->load_time = ktime_get_boot_ns();
+	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+	if (err)
+		goto free_prog;
+
 	/* run eBPF verifier */
 	err = bpf_check(&prog, attr);
 	if (err < 0)
@@ -1358,8 +1390,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 
 	info.type = prog->type;
 	info.id = prog->aux->id;
+	info.load_time = prog->aux->load_time;
+	info.created_by_uid = from_kuid_munged(current_user_ns(),
+					       prog->aux->user->uid);
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
+	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+	ulen = info.nr_map_ids;
+	info.nr_map_ids = prog->aux->used_map_cnt;
+	ulen = min_t(u32, info.nr_map_ids, ulen);
+	if (ulen) {
+		u32 *user_map_ids = (u32 *)info.map_ids;
+		u32 i;
+
+		for (i = 0; i < ulen; i++)
+			if (put_user(prog->aux->used_maps[i]->id,
+				     &user_map_ids[i]))
+				return -EFAULT;
+	}
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
-- 
cgit v1.3-14-g43fede


From ad5b177bd73f5107d97c36f56395c4281fb6f089 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Wed, 27 Sep 2017 14:37:53 -0700
Subject: bpf: Add map_name to bpf_map_info

This patch allows userspace to specify a name for a map
during BPF_MAP_CREATE.

The map's name can later be exported to user space
via BPF_OBJ_GET_INFO_BY_FD.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 1 +
 include/uapi/linux/bpf.h | 2 ++
 kernel/bpf/syscall.c     | 7 ++++++-
 3 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 33ccc474fb04..252f4bc9eb25 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -56,6 +56,7 @@ struct bpf_map {
 	struct work_struct work;
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
+	u8 name[BPF_OBJ_NAME_LEN];
 };
 
 /* function argument constraints */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index bd6348269bf5..6d2137b4cf38 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -190,6 +190,7 @@ union bpf_attr {
 		__u32	numa_node;	/* numa node (effective only if
 					 * BPF_F_NUMA_NODE is set).
 					 */
+		__u8	map_name[BPF_OBJ_NAME_LEN];
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
@@ -829,6 +830,7 @@ struct bpf_map_info {
 	__u32 value_size;
 	__u32 max_entries;
 	__u32 map_flags;
+	__u8  name[BPF_OBJ_NAME_LEN];
 } __attribute__((aligned(8)));
 
 /* User bpf_sock_ops struct to access socket values and specify request ops
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 45970df3f820..11a7f82a55d1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -339,7 +339,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	return 0;
 }
 
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+#define BPF_MAP_CREATE_LAST_FIELD map_name
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
@@ -361,6 +361,10 @@ static int map_create(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	err = bpf_obj_name_cpy(map->name, attr->map_name);
+	if (err)
+		goto free_map_nouncharge;
+
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
@@ -1462,6 +1466,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.value_size = map->value_size;
 	info.max_entries = map->max_entries;
 	info.map_flags = map->map_flags;
+	memcpy(info.name, map->name, sizeof(map->name));
 
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.3-14-g43fede


From 5bce9db1894c998c5b85a34036d679ea6517668f Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Date: Tue, 29 Aug 2017 17:01:03 +0300
Subject: perf/core: Explain perf_sched_mutex

To clarify why atomic_inc_return(&perf_sched_events) is not sufficient and
a mutex is needed to order static branch enabling vs the atomic counter
increment, this adds a comment with a short explanation.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170829140103.6563-1-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..5ee62714f9a6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9394,6 +9394,11 @@ static void account_event(struct perf_event *event)
 		inc = true;
 
 	if (inc) {
+		/*
+		 * We need the mutex here because static_branch_enable()
+		 * must complete *before* the perf_sched_count increment
+		 * becomes visible.
+		 */
 		if (atomic_inc_not_zero(&perf_sched_count))
 			goto enabled;
 
-- 
cgit v1.3-14-g43fede


From 7c80cfc99b7bfdc92cee26f8008859f326f4a37f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 16:03:17 +0200
Subject: sched/fair: Clean up calc_cfs_shares()

For consistencies sake, we should have only a single reading of tg->shares.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ba32e08a23..048fbfb75523 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2694,9 +2694,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 {
-	long tg_weight, load, shares;
+	long tg_weight, tg_shares, load, shares;
+	struct task_group *tg = cfs_rq->tg;
+
+	tg_shares = READ_ONCE(tg->shares);
 
 	/*
 	 * This really should be: cfs_rq->avg.load_avg, but instead we use
@@ -2711,7 +2714,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	tg_weight -= cfs_rq->tg_load_avg_contrib;
 	tg_weight += load;
 
-	shares = (tg->shares * load);
+	shares = (tg_shares * load);
 	if (tg_weight)
 		shares /= tg_weight;
 
@@ -2727,17 +2730,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
 	 * instead of 0.
 	 */
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	if (shares > tg->shares)
-		shares = tg->shares;
-
-	return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
-	return tg->shares;
+	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
 # endif /* CONFIG_SMP */
 
@@ -2762,7 +2755,6 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 static void update_cfs_shares(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
-	struct task_group *tg;
 	long shares;
 
 	if (!cfs_rq)
@@ -2771,13 +2763,14 @@ static void update_cfs_shares(struct sched_entity *se)
 	if (throttled_hierarchy(cfs_rq))
 		return;
 
-	tg = cfs_rq->tg;
-
 #ifndef CONFIG_SMP
-	if (likely(se->load.weight == tg->shares))
+	shares = READ_ONCE(cfs_rq->tg->shares);
+
+	if (likely(se->load.weight == shares))
 		return;
+#else
+	shares = calc_cfs_shares(cfs_rq);
 #endif
-	shares = calc_cfs_shares(cfs_rq, tg);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
-- 
cgit v1.3-14-g43fede


From cef27403cbe98ebda0a32d43128dd60c309eb966 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 9 May 2017 11:04:07 +0200
Subject: sched/fair: Add comment to calc_cfs_shares()

Explain the magic equation in calc_cfs_shares() a bit better.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 048fbfb75523..dd565aeafc5a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2694,6 +2694,67 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------               (1)
+ *			  \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ *   grq->load.weight -> grq->avg.load_avg                         (2)
+ *
+ * which yields the following:
+ *
+ *                     tg->weight * grq->avg.load_avg
+ *   ge->load.weight = ------------------------------              (3)
+ *				tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = ----------------------------- = tg>weight   (4)
+ *			    grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ *   ge->load.weight =
+ *
+ *              tg->weight * grq->load.weight
+ *     ---------------------------------------------------         (5)
+ *     tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ *   \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
 static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	long tg_weight, tg_shares, load, shares;
-- 
cgit v1.3-14-g43fede


From 3d4b60d3e3dde6ea24e439000eb3b71078da81f1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 May 2017 18:16:06 +0200
Subject: sched/fair: Cure calc_cfs_shares() vs. reweight_entity()

Vincent reported that when running in a cgroup, his root
cfs_rq->avg.load_avg dropped to 0 on task idle.

This is because reweight_entity() will now immediately propagate the
weight change of the group entity to its cfs_rq, and as it happens,
our approxmation (5) for calc_cfs_shares() results in 0 when the group
is idle.

Avoid this by using the correct (3) as a lower bound on (5). This way
the empty cgroup will slowly decay instead of instantly drop to 0.

Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd565aeafc5a..63166a0ed854 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2763,11 +2763,10 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 	tg_shares = READ_ONCE(tg->shares);
 
 	/*
-	 * This really should be: cfs_rq->avg.load_avg, but instead we use
-	 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
-	 * the shares for small weight interactive tasks.
+	 * Because (5) drops to 0 when the cfs_rq is idle, we need to use (3)
+	 * as a lower bound.
 	 */
-	load = scale_load_down(cfs_rq->load.weight);
+	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
 	tg_weight = atomic_long_read(&tg->load_avg);
 
-- 
cgit v1.3-14-g43fede


From c7b50216818ef3dca14a52e3499750fbad2d9691 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 16:42:08 +0200
Subject: sched/fair: Remove se->load.weight from se->avg.load_sum

Remove the load from the load_sum for sched_entities, basically
turning load_sum into runnable_sum.  This prepares for better
reweighting of group entities.

Since we now have different rules for computing load_avg, split
___update_load_avg() into two parts, ___update_load_sum() and
___update_load_avg().

So for se:

  ___update_load_sum(.weight = 1)
  ___upate_load_avg(.weight = se->load.weight)

and for cfs_rq:

  ___update_load_sum(.weight = cfs_rq->load.weight)
  ___upate_load_avg(.weight = 1)

Since the primary consumable is load_avg, most things will not be
affected. Only those few sites that initialize/modify load_sum need
attention.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 91 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 63166a0ed854..3b5b82345774 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -731,7 +731,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 */
 	if (entity_is_task(se))
 		sa->load_avg = scale_load_down(se->load.weight);
-	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+	sa->load_sum = LOAD_AVG_MAX;
 	/*
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
 	 */
@@ -2025,7 +2025,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 		delta = runtime - p->last_sum_exec_runtime;
 		*period = now - p->last_task_numa_placement;
 	} else {
-		delta = p->se.avg.load_sum / p->se.load.weight;
+		delta = p->se.avg.load_sum;
 		*period = LOAD_AVG_MAX;
 	}
 
@@ -3018,8 +3018,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
  */
 static __always_inline int
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
-		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
+		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
 	u64 delta;
 
@@ -3065,39 +3065,80 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
 		return 0;
 
+	return 1;
+}
+
+static __always_inline void
+___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq)
+{
+	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
+
 	/*
 	 * Step 2: update *_avg.
 	 */
-	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
+	sa->load_avg = div_u64(weight * sa->load_sum, divider);
 	if (cfs_rq) {
 		cfs_rq->runnable_load_avg =
-			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
+			div_u64(cfs_rq->runnable_load_sum, divider);
 	}
-	sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
+	sa->util_avg = sa->util_sum / divider;
+}
 
-	return 1;
+/*
+ * XXX we want to get rid of this helper and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+	return scale_load_down(se->load.weight);
 }
 
+/*
+ * sched_entity:
+ *
+ *   load_sum := runnable_sum
+ *   load_avg = se_weight(se) * runnable_avg
+ *
+ * cfq_rs:
+ *
+ *   load_sum = \Sum se_weight(se) * se->avg.load_sum
+ *   load_avg = \Sum se->avg.load_avg
+ */
+
 static int
 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
-	return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+	if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
+		___update_load_avg(&se->avg, se_weight(se), NULL);
+		return 1;
+	}
+
+	return 0;
 }
 
 static int
 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	return ___update_load_avg(now, cpu, &se->avg,
-				  se->on_rq * scale_load_down(se->load.weight),
-				  cfs_rq->curr == se, NULL);
+	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
+				cfs_rq->curr == se, NULL)) {
+
+		___update_load_avg(&se->avg, se_weight(se), NULL);
+		return 1;
+	}
+
+	return 0;
 }
 
 static int
 __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 {
-	return ___update_load_avg(now, cpu, &cfs_rq->avg,
-			scale_load_down(cfs_rq->load.weight),
-			cfs_rq->curr != NULL, cfs_rq);
+	if (___update_load_sum(now, cpu, &cfs_rq->avg,
+				scale_load_down(cfs_rq->load.weight),
+				cfs_rq->curr != NULL, cfs_rq)) {
+		___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
+		return 1;
+	}
+
+	return 0;
 }
 
 /*
@@ -3267,7 +3308,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	/* Set new sched_entity's load */
 	se->avg.load_avg = load;
-	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+	se->avg.load_sum = LOAD_AVG_MAX;
 
 	/* Update parent cfs_rq load */
 	add_positive(&cfs_rq->avg.load_avg, delta);
@@ -3473,7 +3514,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 {
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 	cfs_rq->avg.load_avg += se->avg.load_avg;
-	cfs_rq->avg.load_sum += se->avg.load_sum;
+	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
 	set_tg_cfs_propagate(cfs_rq);
@@ -3493,7 +3534,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 {
 
 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 	set_tg_cfs_propagate(cfs_rq);
@@ -3505,12 +3546,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	struct sched_avg *sa = &se->avg;
-
-	cfs_rq->runnable_load_avg += sa->load_avg;
-	cfs_rq->runnable_load_sum += sa->load_sum;
+	cfs_rq->runnable_load_avg += se->avg.load_avg;
+	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
 
-	if (!sa->last_update_time) {
+	if (!se->avg.last_update_time) {
 		attach_entity_load_avg(cfs_rq, se);
 		update_tg_load_avg(cfs_rq, 0);
 	}
@@ -3520,10 +3559,8 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	cfs_rq->runnable_load_avg =
-		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
-	cfs_rq->runnable_load_sum =
-		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
+	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
 }
 
 #ifndef CONFIG_64BIT
-- 
cgit v1.3-14-g43fede


From 88c0616ee729067ecb412bed76ef4a8734ea5100 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 17:32:43 +0200
Subject: sched/fair: Change update_load_avg() arguments

Most call sites of update_load_avg() already have cfs_rq_of(se)
available, pass it down instead of recomputing it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3b5b82345774..a6c53580fea0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3480,9 +3480,8 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 #define SKIP_AGE_LOAD	0x2
 
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int flags)
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 now = cfs_rq_clock_task(cfs_rq);
 	struct rq *rq = rq_of(cfs_rq);
 	int cpu = cpu_of(rq);
@@ -3643,9 +3642,9 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 #define UPDATE_TG	0x0
 #define SKIP_AGE_LOAD	0x0
 
-static inline void update_load_avg(struct sched_entity *se, int not_used1)
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 {
-	cfs_rq_util_change(cfs_rq_of(se));
+	cfs_rq_util_change(cfs_rq);
 }
 
 static inline void
@@ -3796,7 +3795,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *     its group cfs_rq
 	 *   - Add its new weight to cfs_rq->load.weight
 	 */
-	update_load_avg(se, UPDATE_TG);
+	update_load_avg(cfs_rq, se, UPDATE_TG);
 	enqueue_entity_load_avg(cfs_rq, se);
 	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
@@ -3880,7 +3879,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *   - For group entity, update its weight to reflect the new share
 	 *     of its group cfs_rq.
 	 */
-	update_load_avg(se, UPDATE_TG);
+	update_load_avg(cfs_rq, se, UPDATE_TG);
 	dequeue_entity_load_avg(cfs_rq, se);
 
 	update_stats_dequeue(cfs_rq, se, flags);
@@ -3968,7 +3967,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
-		update_load_avg(se, UPDATE_TG);
+		update_load_avg(cfs_rq, se, UPDATE_TG);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
@@ -4070,7 +4069,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		/* Put 'current' back into the tree. */
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
-		update_load_avg(prev, 0);
+		update_load_avg(cfs_rq, prev, 0);
 	}
 	cfs_rq->curr = NULL;
 }
@@ -4086,7 +4085,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
-	update_load_avg(curr, UPDATE_TG);
+	update_load_avg(cfs_rq, curr, UPDATE_TG);
 	update_cfs_shares(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
@@ -5004,7 +5003,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_load_avg(se, UPDATE_TG);
+		update_load_avg(cfs_rq, se, UPDATE_TG);
 		update_cfs_shares(se);
 	}
 
@@ -5063,7 +5062,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_load_avg(se, UPDATE_TG);
+		update_load_avg(cfs_rq, se, UPDATE_TG);
 		update_cfs_shares(se);
 	}
 
@@ -7121,7 +7120,7 @@ static void update_blocked_averages(int cpu)
 		/* Propagate pending load changes to the parent, if any: */
 		se = cfs_rq->tg->se[cpu];
 		if (se && !skip_blocked_update(se))
-			update_load_avg(se, 0);
+			update_load_avg(cfs_rq_of(se), se, 0);
 
 		/*
 		 * There can be a lot of idle CPU cgroups.  Don't let fully
@@ -9295,7 +9294,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
-		update_load_avg(se, UPDATE_TG);
+		update_load_avg(cfs_rq, se, UPDATE_TG);
 	}
 }
 #else
@@ -9307,7 +9306,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	/* Catch up with the cfs_rq and remove our load when we leave */
-	update_load_avg(se, 0);
+	update_load_avg(cfs_rq, se, 0);
 	detach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
 	propagate_entity_cfs_rq(se);
@@ -9326,7 +9325,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
 #endif
 
 	/* Synchronize entity with its cfs_rq */
-	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
+	update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
 	attach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
 	propagate_entity_cfs_rq(se);
@@ -9610,7 +9609,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		rq_lock_irqsave(rq, &rf);
 		update_rq_clock(rq);
 		for_each_sched_entity(se) {
-			update_load_avg(se, UPDATE_TG);
+			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
 			update_cfs_shares(se);
 		}
 		rq_unlock_irqrestore(rq, &rf);
-- 
cgit v1.3-14-g43fede


From b382a531b9fece229c8358a9fb79431cddcf93c2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 17:37:03 +0200
Subject: sched/fair: Move enqueue migrate handling

Move the entity migrate handling from enqueue_entity_load_avg() to
update_load_avg(). This has two benefits:

 - {en,de}queue_entity_load_avg() will become purely about managing
   runnable_load

 - we can avoid a double update_tg_load_avg() and reduce pressure on
   the global tg->shares cacheline

The reason we do this is so that we can change update_cfs_shares() to
change both weight and (future) runnable_weight. For this to work we
need to have the cfs_rq averages up-to-date (which means having done
the attach), but we need the cfs_rq->avg.runnable_avg to not yet
include the se's contribution (since se->on_rq == 0).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 70 +++++++++++++++++++++++++++--------------------------
 1 file changed, 36 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a6c53580fea0..605b7c3e1ab8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3473,34 +3473,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 	return decayed || removed_load;
 }
 
-/*
- * Optional action to be done while updating the load average
- */
-#define UPDATE_TG	0x1
-#define SKIP_AGE_LOAD	0x2
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
-{
-	u64 now = cfs_rq_clock_task(cfs_rq);
-	struct rq *rq = rq_of(cfs_rq);
-	int cpu = cpu_of(rq);
-	int decayed;
-
-	/*
-	 * Track task load average for carrying it to new CPU after migrated, and
-	 * track group sched_entity load average for task_h_load calc in migration
-	 */
-	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
-		__update_load_avg_se(now, cpu, cfs_rq, se);
-
-	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
-	decayed |= propagate_entity_load_avg(se);
-
-	if (decayed && (flags & UPDATE_TG))
-		update_tg_load_avg(cfs_rq, 0);
-}
-
 /**
  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
  * @cfs_rq: cfs_rq to attach to
@@ -3541,17 +3513,46 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	cfs_rq_util_change(cfs_rq);
 }
 
+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG	0x1
+#define SKIP_AGE_LOAD	0x2
+#define DO_ATTACH	0x4
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+	u64 now = cfs_rq_clock_task(cfs_rq);
+	struct rq *rq = rq_of(cfs_rq);
+	int cpu = cpu_of(rq);
+	int decayed;
+
+	/*
+	 * Track task load average for carrying it to new CPU after migrated, and
+	 * track group sched_entity load average for task_h_load calc in migration
+	 */
+	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
+		__update_load_avg_se(now, cpu, cfs_rq, se);
+
+	decayed  = update_cfs_rq_load_avg(now, cfs_rq);
+	decayed |= propagate_entity_load_avg(se);
+
+	if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
+
+		attach_entity_load_avg(cfs_rq, se);
+		update_tg_load_avg(cfs_rq, 0);
+
+	} else if (decayed && (flags & UPDATE_TG))
+		update_tg_load_avg(cfs_rq, 0);
+}
+
 /* Add the load generated by se into cfs_rq's load average */
 static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	cfs_rq->runnable_load_avg += se->avg.load_avg;
 	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
-
-	if (!se->avg.last_update_time) {
-		attach_entity_load_avg(cfs_rq, se);
-		update_tg_load_avg(cfs_rq, 0);
-	}
 }
 
 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
@@ -3641,6 +3642,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 
 #define UPDATE_TG	0x0
 #define SKIP_AGE_LOAD	0x0
+#define DO_ATTACH	0x0
 
 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
 {
@@ -3795,7 +3797,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *     its group cfs_rq
 	 *   - Add its new weight to cfs_rq->load.weight
 	 */
-	update_load_avg(cfs_rq, se, UPDATE_TG);
+	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
 	enqueue_entity_load_avg(cfs_rq, se);
 	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
-- 
cgit v1.3-14-g43fede


From b5b3e35f4149df72aaba612bba195fb2ec37b1b1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 24 Aug 2017 17:38:30 +0200
Subject: sched/fair: Rename {en,de}queue_entity_load_avg()

Since they're now purely about runnable_load, rename them.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 605b7c3e1ab8..fad77c8454e4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3549,7 +3549,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 
 /* Add the load generated by se into cfs_rq's load average */
 static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	cfs_rq->runnable_load_avg += se->avg.load_avg;
 	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
@@ -3557,7 +3557,7 @@ enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
 static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
 	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
@@ -3650,9 +3650,9 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 }
 
 static inline void
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline void
@@ -3798,7 +3798,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *   - Add its new weight to cfs_rq->load.weight
 	 */
 	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
-	enqueue_entity_load_avg(cfs_rq, se);
+	enqueue_runnable_load_avg(cfs_rq, se);
 	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
 
@@ -3882,7 +3882,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *     of its group cfs_rq.
 	 */
 	update_load_avg(cfs_rq, se, UPDATE_TG);
-	dequeue_entity_load_avg(cfs_rq, se);
+	dequeue_runnable_load_avg(cfs_rq, se);
 
 	update_stats_dequeue(cfs_rq, se, flags);
 
-- 
cgit v1.3-14-g43fede


From 8d5b9025f9b4500f828260dc62e8ffa823ce0d59 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 24 Aug 2017 17:45:35 +0200
Subject: sched/fair: Introduce {en,de}queue_load_avg()

Analogous to the existing {en,de}queue_runnable_load_avg() add helpers
for {en,de}queue_load_avg(). More users will follow.

Includes some code movement to avoid fwd declarations.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 156 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 86 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fad77c8454e4..654d8e3d6047 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2692,6 +2692,90 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->nr_running--;
 }
 
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {                           \
+	typeof(_ptr) ptr = (_ptr);                              \
+	typeof(_val) val = (_val);                              \
+	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+								\
+	res = var + val;                                        \
+								\
+	if (val < 0 && res > var)                               \
+		res = 0;                                        \
+								\
+	WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
+#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of this helper and use the full load resolution.
+ */
+static inline long se_weight(struct sched_entity *se)
+{
+	return scale_load_down(se->load.weight);
+}
+
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	cfs_rq->runnable_load_avg += se->avg.load_avg;
+	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
+}
+
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	cfs_rq->avg.load_avg += se->avg.load_avg;
+	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+}
+
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+}
+#else
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
 /*
@@ -3084,14 +3168,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cf
 	sa->util_avg = sa->util_sum / divider;
 }
 
-/*
- * XXX we want to get rid of this helper and use the full load resolution.
- */
-static inline long se_weight(struct sched_entity *se)
-{
-	return scale_load_down(se->load.weight);
-}
-
 /*
  * sched_entity:
  *
@@ -3141,26 +3217,6 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 	return 0;
 }
 
-/*
- * Signed add and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define add_positive(_ptr, _val) do {                           \
-	typeof(_ptr) ptr = (_ptr);                              \
-	typeof(_val) val = (_val);                              \
-	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
-								\
-	res = var + val;                                        \
-								\
-	if (val < 0 && res > var)                               \
-		res = 0;                                        \
-								\
-	WRITE_ONCE(*ptr, res);                                  \
-} while (0)
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /**
  * update_tg_load_avg - update the tg's load avg
@@ -3405,23 +3461,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do {				\
-	typeof(_ptr) ptr = (_ptr);				\
-	typeof(*ptr) val = (_val);				\
-	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
-	res = var - val;					\
-	if (res > var)						\
-		res = 0;					\
-	WRITE_ONCE(*ptr, res);					\
-} while (0)
-
 /**
  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
  * @now: current time, as per cfs_rq_clock_task()
@@ -3484,8 +3523,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
-	cfs_rq->avg.load_avg += se->avg.load_avg;
-	cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
+	enqueue_load_avg(cfs_rq, se);
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
 	set_tg_cfs_propagate(cfs_rq);
@@ -3503,9 +3541,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-
-	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+	dequeue_load_avg(cfs_rq, se);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
 	set_tg_cfs_propagate(cfs_rq);
@@ -3547,22 +3583,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 		update_tg_load_avg(cfs_rq, 0);
 }
 
-/* Add the load generated by se into cfs_rq's load average */
-static inline void
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	cfs_rq->runnable_load_avg += se->avg.load_avg;
-	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
-}
-
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
-static inline void
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
-	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
-}
-
 #ifndef CONFIG_64BIT
 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 {
@@ -3649,10 +3669,6 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	cfs_rq_util_change(cfs_rq);
 }
 
-static inline void
-enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
-static inline void
-dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
 
 static inline void
-- 
cgit v1.3-14-g43fede


From 840c5abca499a858619954dbcffc82110bb6e076 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 16:11:34 +0200
Subject: sched/fair: More accurate reweight_entity()

When a (group) entity changes it's weight we should instantly change
its load_avg and propagate that change into the sums it is part of.
Because we use these values to predict future behaviour and are not
interested in its historical value.

Without this change, the change in load would need to propagate
through the average, by which time it could again have changed etc..
always chasing itself.

With this change, the cfs_rq load_avg sum will more accurately reflect
the current runnable and expected return of blocked load.

Reported-by: Paul Turner <pjt@google.com>
[josef: compile fix !SMP || !FAIR_GROUP]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 654d8e3d6047..750ae4dbf812 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2886,12 +2886,22 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 		if (cfs_rq->curr == se)
 			update_curr(cfs_rq);
 		account_entity_dequeue(cfs_rq, se);
+		dequeue_runnable_load_avg(cfs_rq, se);
 	}
+	dequeue_load_avg(cfs_rq, se);
 
 	update_load_set(&se->load, weight);
 
-	if (se->on_rq)
+#ifdef CONFIG_SMP
+	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
+				   LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
+#endif
+
+	enqueue_load_avg(cfs_rq, se);
+	if (se->on_rq) {
 		account_entity_enqueue(cfs_rq, se);
+		enqueue_runnable_load_avg(cfs_rq, se);
+	}
 }
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-- 
cgit v1.3-14-g43fede


From 9059393e4ec1c8c6623a120b405ef2c90b968d80 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Wed, 17 May 2017 11:50:45 +0200
Subject: sched/fair: Use reweight_entity() for set_user_nice()

Now that we directly change load_avg and propagate that change into
the sums, sys_nice() and co should do the same, otherwise its possible
to confuse load accounting when we migrate near the weight change.

Fixes-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
[ Added changelog, fixed the call condition. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20170517095045.GA8420@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  | 22 ++++++++++++------
 kernel/sched/fair.c  | 63 ++++++++++++++++++++++++++++++----------------------
 kernel/sched/sched.h |  2 ++
 3 files changed, 54 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..2288a145bf01 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -733,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
 }
 #endif
 
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
 {
 	int prio = p->static_prio - MAX_RT_PRIO;
 	struct load_weight *load = &p->se.load;
@@ -747,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
 		return;
 	}
 
-	load->weight = scale_load(sched_prio_to_weight[prio]);
-	load->inv_weight = sched_prio_to_wmult[prio];
+	/*
+	 * SCHED_OTHER tasks have to update their load when changing their
+	 * weight
+	 */
+	if (update_load && p->sched_class == &fair_sched_class) {
+		reweight_task(p, prio);
+	} else {
+		load->weight = scale_load(sched_prio_to_weight[prio]);
+		load->inv_weight = sched_prio_to_wmult[prio];
+	}
 }
 
 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2358,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 			p->static_prio = NICE_TO_PRIO(0);
 
 		p->prio = p->normal_prio = __normal_prio(p);
-		set_load_weight(p);
+		set_load_weight(p, false);
 
 		/*
 		 * We don't need the reset flag anymore after the fork. It has
@@ -3805,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
 		put_prev_task(rq, p);
 
 	p->static_prio = NICE_TO_PRIO(nice);
-	set_load_weight(p);
+	set_load_weight(p, true);
 	old_prio = p->prio;
 	p->prio = effective_prio(p);
 	delta = p->prio - old_prio;
@@ -3962,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
 	 */
 	p->rt_priority = attr->sched_priority;
 	p->normal_prio = normal_prio(p);
-	set_load_weight(p);
+	set_load_weight(p, true);
 }
 
 /* Actually do priority change: must hold pi & rq lock. */
@@ -5933,7 +5941,7 @@ void __init sched_init(void)
 		atomic_set(&rq->nr_iowait, 0);
 	}
 
-	set_load_weight(&init_task);
+	set_load_weight(&init_task, false);
 
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 750ae4dbf812..d8c02b31498d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2776,6 +2776,43 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+			    unsigned long weight)
+{
+	if (se->on_rq) {
+		/* commit outstanding execution time */
+		if (cfs_rq->curr == se)
+			update_curr(cfs_rq);
+		account_entity_dequeue(cfs_rq, se);
+		dequeue_runnable_load_avg(cfs_rq, se);
+	}
+	dequeue_load_avg(cfs_rq, se);
+
+	update_load_set(&se->load, weight);
+
+#ifdef CONFIG_SMP
+	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
+				   LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
+#endif
+
+	enqueue_load_avg(cfs_rq, se);
+	if (se->on_rq) {
+		account_entity_enqueue(cfs_rq, se);
+		enqueue_runnable_load_avg(cfs_rq, se);
+	}
+}
+
+void reweight_task(struct task_struct *p, int prio)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	struct load_weight *load = &se->load;
+	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
+
+	reweight_entity(cfs_rq, se, weight);
+	load->inv_weight = sched_prio_to_wmult[prio];
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
 /*
@@ -2878,32 +2915,6 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 }
 # endif /* CONFIG_SMP */
 
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-			    unsigned long weight)
-{
-	if (se->on_rq) {
-		/* commit outstanding execution time */
-		if (cfs_rq->curr == se)
-			update_curr(cfs_rq);
-		account_entity_dequeue(cfs_rq, se);
-		dequeue_runnable_load_avg(cfs_rq, se);
-	}
-	dequeue_load_avg(cfs_rq, se);
-
-	update_load_set(&se->load, weight);
-
-#ifdef CONFIG_SMP
-	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
-				   LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
-#endif
-
-	enqueue_load_avg(cfs_rq, se);
-	if (se->on_rq) {
-		account_entity_enqueue(cfs_rq, se);
-		enqueue_runnable_load_avg(cfs_rq, se);
-	}
-}
-
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
 static void update_cfs_shares(struct sched_entity *se)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14db76cd496f..a5d97460ee4e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1529,6 +1529,8 @@ extern void init_sched_dl_class(void);
 extern void init_sched_rt_class(void);
 extern void init_sched_fair_class(void);
 
+extern void reweight_task(struct task_struct *p, int prio);
+
 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);
 
-- 
cgit v1.3-14-g43fede


From 2a2f5d4e44ed160a5ed822c94e04f918f9fbb487 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 May 2017 16:51:41 +0200
Subject: sched/fair: Rewrite cfs_rq->removed_*avg

Since on wakeup migration we don't hold the rq->lock for the old CPU
we cannot update its state. Instead we add the removed 'load' to an
atomic variable and have the next update on that CPU collect and
process it.

Currently we have 2 atomic variables; which already have the issue
that they can be read out-of-sync. Also, two atomic ops on a single
cacheline is already more expensive than an uncontended lock.

Since we want to add more, convert the thing over to an explicit
cacheline with a lock in.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c |  8 ++++----
 kernel/sched/fair.c  | 51 +++++++++++++++++++++++++++++++++++----------------
 kernel/sched/sched.h | 13 +++++++++----
 3 files changed, 48 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f93e4a2d9f6..2f22342c48ff 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -564,10 +564,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
-	SEQ_printf(m, "  .%-30s: %ld\n", "removed_load_avg",
-			atomic_long_read(&cfs_rq->removed_load_avg));
-	SEQ_printf(m, "  .%-30s: %ld\n", "removed_util_avg",
-			atomic_long_read(&cfs_rq->removed_util_avg));
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
+			cfs_rq->removed.load_avg);
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
+			cfs_rq->removed.util_avg);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
 			cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d8c02b31498d..fe4a66b10480 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3501,36 +3501,47 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
+	unsigned long removed_load = 0, removed_util = 0;
 	struct sched_avg *sa = &cfs_rq->avg;
-	int decayed, removed_load = 0, removed_util = 0;
+	int decayed = 0;
 
-	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
-		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+	if (cfs_rq->removed.nr) {
+		unsigned long r;
+
+		raw_spin_lock(&cfs_rq->removed.lock);
+		swap(cfs_rq->removed.util_avg, removed_util);
+		swap(cfs_rq->removed.load_avg, removed_load);
+		cfs_rq->removed.nr = 0;
+		raw_spin_unlock(&cfs_rq->removed.lock);
+
+		/*
+		 * The LOAD_AVG_MAX for _sum is a slight over-estimate,
+		 * which is safe due to sub_positive() clipping at 0.
+		 */
+		r = removed_load;
 		sub_positive(&sa->load_avg, r);
 		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
-		removed_load = 1;
-		set_tg_cfs_propagate(cfs_rq);
-	}
 
-	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
-		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+		r = removed_util;
 		sub_positive(&sa->util_avg, r);
 		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
-		removed_util = 1;
+
 		set_tg_cfs_propagate(cfs_rq);
+
+		decayed = 1;
 	}
 
-	decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
+	decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
 
 #ifndef CONFIG_64BIT
 	smp_wmb();
 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
-	if (decayed || removed_util)
+	if (decayed)
 		cfs_rq_util_change(cfs_rq);
 
-	return decayed || removed_load;
+	return decayed;
 }
 
 /**
@@ -3645,6 +3656,7 @@ void sync_entity_load_avg(struct sched_entity *se)
 void remove_entity_load_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	unsigned long flags;
 
 	/*
 	 * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3654,11 +3666,19 @@ void remove_entity_load_avg(struct sched_entity *se)
 	 * Similarly for groups, they will have passed through
 	 * post_init_entity_util_avg() before unregister_sched_fair_group()
 	 * calls this.
+	 *
+	 * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING
+	 * we could actually get the right time, since we're called with
+	 * rq->lock held, see detach_task().
 	 */
 
 	sync_entity_load_avg(se);
-	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
-	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+	raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
+	++cfs_rq->removed.nr;
+	cfs_rq->removed.util_avg	+= se->avg.util_avg;
+	cfs_rq->removed.load_avg	+= se->avg.load_avg;
+	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
@@ -9449,8 +9469,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->propagate_avg = 0;
 #endif
-	atomic_long_set(&cfs_rq->removed_load_avg, 0);
-	atomic_long_set(&cfs_rq->removed_util_avg, 0);
+	raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a5d97460ee4e..2fd350a12bb7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -445,14 +445,19 @@ struct cfs_rq {
 	struct sched_avg avg;
 	u64 runnable_load_sum;
 	unsigned long runnable_load_avg;
+#ifndef CONFIG_64BIT
+	u64 load_last_update_time_copy;
+#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	unsigned long tg_load_avg_contrib;
 	unsigned long propagate_avg;
 #endif
-	atomic_long_t removed_load_avg, removed_util_avg;
-#ifndef CONFIG_64BIT
-	u64 load_last_update_time_copy;
-#endif
+	struct {
+		raw_spinlock_t	lock ____cacheline_aligned;
+		int		nr;
+		unsigned long	load_avg;
+		unsigned long	util_avg;
+	} removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
-- 
cgit v1.3-14-g43fede


From 0e2d2aaaae52c247c047d14999b93486bdbd3431 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 May 2017 17:30:46 +0200
Subject: sched/fair: Rewrite PELT migration propagation

When an entity migrates in (or out) of a runqueue, we need to add (or
remove) its contribution from the entire PELT hierarchy, because even
non-runnable entities are included in the load average sums.

In order to do this we have some propagation logic that updates the
PELT tree, however the way it 'propagates' the runnable (or load)
change is (more or less):

                     tg->weight * grq->avg.load_avg
  ge->avg.load_avg = ------------------------------
                               tg->load_avg

But that is the expression for ge->weight, and per the definition of
load_avg:

  ge->avg.load_avg := ge->weight * ge->avg.runnable_avg

That destroys the runnable_avg (by setting it to 1) we wanted to
propagate.

Instead directly propagate runnable_sum.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/debug.c |   2 +
 kernel/sched/fair.c  | 186 +++++++++++++++++++++++++++++----------------------
 kernel/sched/sched.h |   9 +--
 3 files changed, 112 insertions(+), 85 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f22342c48ff..2e039a81864c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -568,6 +568,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->removed.load_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.util_avg",
 			cfs_rq->removed.util_avg);
+	SEQ_printf(m, "  .%-30s: %ld\n", "removed.runnable_sum",
+			cfs_rq->removed.runnable_sum);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	SEQ_printf(m, "  .%-30s: %lu\n", "tg_load_avg_contrib",
 			cfs_rq->tg_load_avg_contrib);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe4a66b10480..086a5d979720 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3319,11 +3319,77 @@ void set_task_rq_fair(struct sched_entity *se,
 	se->avg.last_update_time = n_last_update_time;
 }
 
-/* Take into account change of utilization of a child task group */
+
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ *   ge->avg == grq->avg						(1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg		(2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ *   grq->avg.load_avg = grq->load.weight * grq->avg.running_avg	(3)
+ *
+ * And per (1) we have:
+ *
+ *   ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ *                      ge->load.weight * grq->avg.load_avg
+ *   ge->avg.load_avg = -----------------------------------		(4)
+ *                               grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ *   grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ *   d(runnable_avg) += se->avg.load_avg / se->load.weight
+ *								   (5)
+ *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
 static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
 	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
 
 	/* Nothing to update */
@@ -3339,102 +3405,59 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
 }
 
-/* Take into account change of load of a child task group */
 static inline void
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
-	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
-	long delta, load = gcfs_rq->avg.load_avg;
+	long runnable_sum = gcfs_rq->prop_runnable_sum;
+	long load_avg;
+	s64 load_sum;
 
-	/*
-	 * If the load of group cfs_rq is null, the load of the
-	 * sched_entity will also be null so we can skip the formula
-	 */
-	if (load) {
-		long tg_load;
-
-		/* Get tg's load and ensure tg_load > 0 */
-		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
-
-		/* Ensure tg_load >= load and updated with current load*/
-		tg_load -= gcfs_rq->tg_load_avg_contrib;
-		tg_load += load;
-
-		/*
-		 * We need to compute a correction term in the case that the
-		 * task group is consuming more CPU than a task of equal
-		 * weight. A task with a weight equals to tg->shares will have
-		 * a load less or equal to scale_load_down(tg->shares).
-		 * Similarly, the sched_entities that represent the task group
-		 * at parent level, can't have a load higher than
-		 * scale_load_down(tg->shares). And the Sum of sched_entities'
-		 * load must be <= scale_load_down(tg->shares).
-		 */
-		if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
-			/* scale gcfs_rq's load into tg's shares*/
-			load *= scale_load_down(gcfs_rq->tg->shares);
-			load /= tg_load;
-		}
-	}
+	if (!runnable_sum)
+		return;
 
-	delta = load - se->avg.load_avg;
+	gcfs_rq->prop_runnable_sum = 0;
 
-	/* Nothing to update */
-	if (!delta)
-		return;
+	load_sum = (s64)se_weight(se) * runnable_sum;
+	load_avg = div_s64(load_sum, LOAD_AVG_MAX);
 
-	/* Set new sched_entity's load */
-	se->avg.load_avg = load;
-	se->avg.load_sum = LOAD_AVG_MAX;
+	add_positive(&se->avg.load_sum, runnable_sum);
+	add_positive(&se->avg.load_avg, load_avg);
 
-	/* Update parent cfs_rq load */
-	add_positive(&cfs_rq->avg.load_avg, delta);
-	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+	add_positive(&cfs_rq->avg.load_avg, load_avg);
+	add_positive(&cfs_rq->avg.load_sum, load_sum);
 
-	/*
-	 * If the sched_entity is already enqueued, we also have to update the
-	 * runnable load avg.
-	 */
 	if (se->on_rq) {
-		/* Update parent cfs_rq runnable_load_avg */
-		add_positive(&cfs_rq->runnable_load_avg, delta);
-		cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+		add_positive(&cfs_rq->runnable_load_avg, load_avg);
+		add_positive(&cfs_rq->runnable_load_sum, load_sum);
 	}
 }
 
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
 {
-	cfs_rq->propagate_avg = 1;
-}
-
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
-{
-	struct cfs_rq *cfs_rq = group_cfs_rq(se);
-
-	if (!cfs_rq->propagate_avg)
-		return 0;
-
-	cfs_rq->propagate_avg = 0;
-	return 1;
+	cfs_rq->propagate = 1;
+	cfs_rq->prop_runnable_sum += runnable_sum;
 }
 
 /* Update task and its cfs_rq load average */
 static inline int propagate_entity_load_avg(struct sched_entity *se)
 {
-	struct cfs_rq *cfs_rq;
+	struct cfs_rq *cfs_rq, *gcfs_rq;
 
 	if (entity_is_task(se))
 		return 0;
 
-	if (!test_and_clear_tg_cfs_propagate(se))
+	gcfs_rq = group_cfs_rq(se);
+	if (!gcfs_rq->propagate)
 		return 0;
 
+	gcfs_rq->propagate = 0;
+
 	cfs_rq = cfs_rq_of(se);
 
-	set_tg_cfs_propagate(cfs_rq);
+	add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
 
-	update_tg_cfs_util(cfs_rq, se);
-	update_tg_cfs_load(cfs_rq, se);
+	update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+	update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
 
 	return 1;
 }
@@ -3458,7 +3481,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
 	 * If there is a pending propagation, we have to update the load and
 	 * the utilization of the sched_entity:
 	 */
-	if (gcfs_rq->propagate_avg)
+	if (gcfs_rq->propagate)
 		return false;
 
 	/*
@@ -3478,7 +3501,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
 	return 0;
 }
 
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -3501,7 +3524,7 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
 static inline int
 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 {
-	unsigned long removed_load = 0, removed_util = 0;
+	unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
 	struct sched_avg *sa = &cfs_rq->avg;
 	int decayed = 0;
 
@@ -3511,6 +3534,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		raw_spin_lock(&cfs_rq->removed.lock);
 		swap(cfs_rq->removed.util_avg, removed_util);
 		swap(cfs_rq->removed.load_avg, removed_load);
+		swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
 		cfs_rq->removed.nr = 0;
 		raw_spin_unlock(&cfs_rq->removed.lock);
 
@@ -3526,7 +3550,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		sub_positive(&sa->util_avg, r);
 		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 
-		set_tg_cfs_propagate(cfs_rq);
+		add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
 
 		decayed = 1;
 	}
@@ -3558,7 +3582,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	enqueue_load_avg(cfs_rq, se);
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
-	set_tg_cfs_propagate(cfs_rq);
+
+	add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3576,7 +3601,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
 	dequeue_load_avg(cfs_rq, se);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
-	set_tg_cfs_propagate(cfs_rq);
+
+	add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
 
 	cfs_rq_util_change(cfs_rq);
 }
@@ -3678,6 +3704,7 @@ void remove_entity_load_avg(struct sched_entity *se)
 	++cfs_rq->removed.nr;
 	cfs_rq->removed.util_avg	+= se->avg.util_avg;
 	cfs_rq->removed.load_avg	+= se->avg.load_avg;
+	cfs_rq->removed.runnable_sum	+= se->avg.load_sum; /* == runnable_sum */
 	raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
 }
 
@@ -9466,9 +9493,6 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->propagate_avg = 0;
-#endif
 	raw_spin_lock_init(&cfs_rq->removed.lock);
 #endif
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 2fd350a12bb7..5bcb86eb026b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -447,19 +447,20 @@ struct cfs_rq {
 	unsigned long runnable_load_avg;
 #ifndef CONFIG_64BIT
 	u64 load_last_update_time_copy;
-#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	unsigned long tg_load_avg_contrib;
-	unsigned long propagate_avg;
 #endif
 	struct {
 		raw_spinlock_t	lock ____cacheline_aligned;
 		int		nr;
 		unsigned long	load_avg;
 		unsigned long	util_avg;
+		unsigned long	runnable_sum;
 	} removed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	unsigned long tg_load_avg_contrib;
+	long propagate;
+	long prop_runnable_sum;
+
 	/*
 	 *   h_load = weight * f(tg)
 	 *
-- 
cgit v1.3-14-g43fede


From 1ea6c46a23f1213d1972bfae220db5c165e27bba Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sat, 6 May 2017 15:59:54 +0200
Subject: sched/fair: Propagate an effective runnable_load_avg

The load balancer uses runnable_load_avg as load indicator. For
!cgroup this is:

  runnable_load_avg = \Sum se->avg.load_avg ; where se->on_rq

That is, a direct sum of all runnable tasks on that runqueue. As
opposed to load_avg, which is a sum of all tasks on the runqueue,
which includes a blocked component.

However, in the cgroup case, this comes apart since the group entities
are always runnable, even if most of their constituent entities are
blocked.

Therefore introduce a runnable_weight which for task entities is the
same as the regular weight, but for group entities is a fraction of
the entity weight and represents the runnable part of the group
runqueue.

Then propagate this load through the PELT hierarchy to arrive at an
effective runnable load avgerage -- which we should not confuse with
the canonical runnable load average.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |   3 +
 kernel/sched/debug.c  |   8 ++-
 kernel/sched/fair.c   | 172 +++++++++++++++++++++++++++++++++-----------------
 kernel/sched/sched.h  |   3 +-
 4 files changed, 124 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a7df4e558c..bdd6ad6fcce1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -331,9 +331,11 @@ struct load_weight {
 struct sched_avg {
 	u64				last_update_time;
 	u64				load_sum;
+	u64				runnable_load_sum;
 	u32				util_sum;
 	u32				period_contrib;
 	unsigned long			load_avg;
+	unsigned long			runnable_load_avg;
 	unsigned long			util_avg;
 };
 
@@ -376,6 +378,7 @@ struct sched_statistics {
 struct sched_entity {
 	/* For load-balancing: */
 	struct load_weight		load;
+	unsigned long			runnable_weight;
 	struct rb_node			run_node;
 	struct list_head		group_node;
 	unsigned int			on_rq;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2e039a81864c..1ca0130ed4f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 		P_SCHEDSTAT(se->statistics.wait_count);
 	}
 	P(se->load.weight);
+	P(se->runnable_weight);
 #ifdef CONFIG_SMP
 	P(se->avg.load_avg);
 	P(se->avg.util_avg);
+	P(se->avg.runnable_load_avg);
 #endif
 
 #undef PN_SCHEDSTAT
@@ -558,10 +560,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
 	SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SMP
+	SEQ_printf(m, "  .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
 	SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
 			cfs_rq->avg.load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "runnable_load_avg",
-			cfs_rq->runnable_load_avg);
+			cfs_rq->avg.runnable_load_avg);
 	SEQ_printf(m, "  .%-30s: %lu\n", "util_avg",
 			cfs_rq->avg.util_avg);
 	SEQ_printf(m, "  .%-30s: %ld\n", "removed.load_avg",
@@ -1006,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 		   "nr_involuntary_switches", (long long)p->nivcsw);
 
 	P(se.load.weight);
+	P(se.runnable_weight);
 #ifdef CONFIG_SMP
 	P(se.avg.load_sum);
+	P(se.avg.runnable_load_sum);
 	P(se.avg.util_sum);
 	P(se.avg.load_avg);
+	P(se.avg.runnable_load_avg);
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 086a5d979720..10d2000fca2d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -730,8 +730,9 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 * nothing has been attached to the task group yet.
 	 */
 	if (entity_is_task(se))
-		sa->load_avg = scale_load_down(se->load.weight);
-	sa->load_sum = LOAD_AVG_MAX;
+		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+	sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX;
+
 	/*
 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
 	 */
@@ -2731,25 +2732,35 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 #ifdef CONFIG_SMP
 /*
- * XXX we want to get rid of this helper and use the full load resolution.
+ * XXX we want to get rid of these helpers and use the full load resolution.
  */
 static inline long se_weight(struct sched_entity *se)
 {
 	return scale_load_down(se->load.weight);
 }
 
+static inline long se_runnable(struct sched_entity *se)
+{
+	return scale_load_down(se->runnable_weight);
+}
+
 static inline void
 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	cfs_rq->runnable_load_avg += se->avg.load_avg;
-	cfs_rq->runnable_load_sum += se_weight(se) * se->avg.load_sum;
+	cfs_rq->runnable_weight += se->runnable_weight;
+
+	cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
+	cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
 }
 
 static inline void
 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	sub_positive(&cfs_rq->runnable_load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->runnable_load_sum, se_weight(se) * se->avg.load_sum);
+	cfs_rq->runnable_weight -= se->runnable_weight;
+
+	sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
+	sub_positive(&cfs_rq->avg.runnable_load_sum,
+		     se_runnable(se) * se->avg.runnable_load_sum);
 }
 
 static inline void
@@ -2777,7 +2788,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-			    unsigned long weight)
+			    unsigned long weight, unsigned long runnable)
 {
 	if (se->on_rq) {
 		/* commit outstanding execution time */
@@ -2788,11 +2799,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	}
 	dequeue_load_avg(cfs_rq, se);
 
+	se->runnable_weight = runnable;
 	update_load_set(&se->load, weight);
 
 #ifdef CONFIG_SMP
-	se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum,
-				   LOAD_AVG_MAX - 1024 + se->avg.period_contrib);
+	do {
+		u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
+
+		se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
+		se->avg.runnable_load_avg =
+			div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
+	} while (0);
 #endif
 
 	enqueue_load_avg(cfs_rq, se);
@@ -2809,7 +2826,7 @@ void reweight_task(struct task_struct *p, int prio)
 	struct load_weight *load = &se->load;
 	unsigned long weight = scale_load(sched_prio_to_weight[prio]);
 
-	reweight_entity(cfs_rq, se, weight);
+	reweight_entity(cfs_rq, se, weight, weight);
 	load->inv_weight = sched_prio_to_wmult[prio];
 }
 
@@ -2917,31 +2934,45 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct sched_entity *se)
+/*
+ * Recomputes the group entity based on the current state of its group
+ * runqueue.
+ */
+static void update_cfs_group(struct sched_entity *se)
 {
-	struct cfs_rq *cfs_rq = group_cfs_rq(se);
-	long shares;
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+	long shares, runnable;
 
-	if (!cfs_rq)
+	if (!gcfs_rq)
 		return;
 
-	if (throttled_hierarchy(cfs_rq))
+	if (throttled_hierarchy(gcfs_rq))
 		return;
 
 #ifndef CONFIG_SMP
-	shares = READ_ONCE(cfs_rq->tg->shares);
+	runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
 
 	if (likely(se->load.weight == shares))
 		return;
 #else
-	shares = calc_cfs_shares(cfs_rq);
+	shares = calc_cfs_shares(gcfs_rq);
+	/*
+	 * The hierarchical runnable load metric is the proportional part
+	 * of this group's runnable_load_avg / load_avg.
+	 *
+	 * Note: we need to deal with very sporadic 'runnable > load' cases
+	 * due to numerical instability.
+	 */
+	runnable = shares * gcfs_rq->avg.runnable_load_avg;
+	if (runnable)
+		runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg);
 #endif
 
-	reweight_entity(cfs_rq_of(se), se, shares);
+	reweight_entity(cfs_rq_of(se), se, shares, runnable);
 }
 
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct sched_entity *se)
+static inline void update_cfs_group(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3050,7 +3081,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
  */
 static __always_inline u32
 accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
-	       unsigned long weight, int running, struct cfs_rq *cfs_rq)
+	       unsigned long load, unsigned long runnable, int running)
 {
 	unsigned long scale_freq, scale_cpu;
 	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
@@ -3067,10 +3098,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 	 */
 	if (periods) {
 		sa->load_sum = decay_load(sa->load_sum, periods);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				decay_load(cfs_rq->runnable_load_sum, periods);
-		}
+		sa->runnable_load_sum =
+			decay_load(sa->runnable_load_sum, periods);
 		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 
 		/*
@@ -3083,11 +3112,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 	sa->period_contrib = delta;
 
 	contrib = cap_scale(contrib, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * contrib;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * contrib;
-	}
+	if (load)
+		sa->load_sum += load * contrib;
+	if (runnable)
+		sa->runnable_load_sum += runnable * contrib;
 	if (running)
 		sa->util_sum += contrib * scale_cpu;
 
@@ -3124,7 +3152,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
  */
 static __always_inline int
 ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
-		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
+		  unsigned long load, unsigned long runnable, int running)
 {
 	u64 delta;
 
@@ -3157,8 +3185,8 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
 	 * this happens during idle_balance() which calls
 	 * update_blocked_averages()
 	 */
-	if (!weight)
-		running = 0;
+	if (!load)
+		runnable = running = 0;
 
 	/*
 	 * Now we know we crossed measurement unit boundaries. The *_avg
@@ -3167,45 +3195,60 @@ ___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
 	 * Step 1: accumulate *_sum since last_update_time. If we haven't
 	 * crossed period boundaries, finish.
 	 */
-	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+	if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
 		return 0;
 
 	return 1;
 }
 
 static __always_inline void
-___update_load_avg(struct sched_avg *sa, unsigned long weight, struct cfs_rq *cfs_rq)
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
 {
 	u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 
 	/*
 	 * Step 2: update *_avg.
 	 */
-	sa->load_avg = div_u64(weight * sa->load_sum, divider);
-	if (cfs_rq) {
-		cfs_rq->runnable_load_avg =
-			div_u64(cfs_rq->runnable_load_sum, divider);
-	}
+	sa->load_avg = div_u64(load * sa->load_sum, divider);
+	sa->runnable_load_avg =	div_u64(runnable * sa->runnable_load_sum, divider);
 	sa->util_avg = sa->util_sum / divider;
 }
 
 /*
  * sched_entity:
  *
+ *   task:
+ *     se_runnable() == se_weight()
+ *
+ *   group: [ see update_cfs_group() ]
+ *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
+ *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
  *   load_sum := runnable_sum
  *   load_avg = se_weight(se) * runnable_avg
  *
+ *   runnable_load_sum := runnable_sum
+ *   runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ * XXX collapse load_sum and runnable_load_sum
+ *
  * cfq_rs:
  *
  *   load_sum = \Sum se_weight(se) * se->avg.load_sum
  *   load_avg = \Sum se->avg.load_avg
+ *
+ *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ *   runnable_load_avg = \Sum se->avg.runable_load_avg
  */
 
 static int
 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
-	if (___update_load_sum(now, cpu, &se->avg, 0, 0, NULL)) {
-		___update_load_avg(&se->avg, se_weight(se), NULL);
+	if (entity_is_task(se))
+		se->runnable_weight = se->load.weight;
+
+	if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
+		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 		return 1;
 	}
 
@@ -3215,10 +3258,13 @@ __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 static int
 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq,
-				cfs_rq->curr == se, NULL)) {
+	if (entity_is_task(se))
+		se->runnable_weight = se->load.weight;
+
+	if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
+				cfs_rq->curr == se)) {
 
-		___update_load_avg(&se->avg, se_weight(se), NULL);
+		___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
 		return 1;
 	}
 
@@ -3230,8 +3276,10 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 {
 	if (___update_load_sum(now, cpu, &cfs_rq->avg,
 				scale_load_down(cfs_rq->load.weight),
-				cfs_rq->curr != NULL, cfs_rq)) {
-		___update_load_avg(&cfs_rq->avg, 1, cfs_rq);
+				scale_load_down(cfs_rq->runnable_weight),
+				cfs_rq->curr != NULL)) {
+
+		___update_load_avg(&cfs_rq->avg, 1, 1);
 		return 1;
 	}
 
@@ -3409,8 +3457,8 @@ static inline void
 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
 {
 	long runnable_sum = gcfs_rq->prop_runnable_sum;
-	long load_avg;
-	s64 load_sum;
+	long runnable_load_avg, load_avg;
+	s64 runnable_load_sum, load_sum;
 
 	if (!runnable_sum)
 		return;
@@ -3426,9 +3474,15 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
 	add_positive(&cfs_rq->avg.load_avg, load_avg);
 	add_positive(&cfs_rq->avg.load_sum, load_sum);
 
+	runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
+	runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
+
+	add_positive(&se->avg.runnable_load_sum, runnable_sum);
+	add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
+
 	if (se->on_rq) {
-		add_positive(&cfs_rq->runnable_load_avg, load_avg);
-		add_positive(&cfs_rq->runnable_load_sum, load_sum);
+		add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
+		add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
 	}
 }
 
@@ -3710,7 +3764,7 @@ void remove_entity_load_avg(struct sched_entity *se)
 
 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
 {
-	return cfs_rq->runnable_load_avg;
+	return cfs_rq->avg.runnable_load_avg;
 }
 
 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
@@ -3882,8 +3936,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 *   - Add its new weight to cfs_rq->load.weight
 	 */
 	update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+	update_cfs_group(se);
 	enqueue_runnable_load_avg(cfs_rq, se);
-	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
 
 	if (flags & ENQUEUE_WAKEUP)
@@ -3989,7 +4043,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
 
-	update_cfs_shares(se);
+	update_cfs_group(se);
 
 	/*
 	 * Now advance min_vruntime if @se was the entity holding it back,
@@ -4172,7 +4226,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_load_avg(cfs_rq, curr, UPDATE_TG);
-	update_cfs_shares(curr);
+	update_cfs_group(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
 	/*
@@ -5090,7 +5144,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
-		update_cfs_shares(se);
+		update_cfs_group(se);
 	}
 
 	if (!se)
@@ -5149,7 +5203,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(cfs_rq, se, UPDATE_TG);
-		update_cfs_shares(se);
+		update_cfs_group(se);
 	}
 
 	if (!se)
@@ -7174,7 +7228,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	if (cfs_rq->avg.util_sum)
 		return false;
 
-	if (cfs_rq->runnable_load_sum)
+	if (cfs_rq->avg.runnable_load_sum)
 		return false;
 
 	return true;
@@ -9692,7 +9746,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		update_rq_clock(rq);
 		for_each_sched_entity(se) {
 			update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
-			update_cfs_shares(se);
+			update_cfs_group(se);
 		}
 		rq_unlock_irqrestore(rq, &rf);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5bcb86eb026b..e83d1b8be611 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -418,6 +418,7 @@ struct cfs_bandwidth { };
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
 	struct load_weight load;
+	unsigned long runnable_weight;
 	unsigned int nr_running, h_nr_running;
 
 	u64 exec_clock;
@@ -443,8 +444,6 @@ struct cfs_rq {
 	 * CFS load tracking
 	 */
 	struct sched_avg avg;
-	u64 runnable_load_sum;
-	unsigned long runnable_load_avg;
 #ifndef CONFIG_64BIT
 	u64 load_last_update_time_copy;
 #endif
-- 
cgit v1.3-14-g43fede


From 144d8487bc6e9b741895709cb46d4e19b748a725 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 11 May 2017 17:57:24 +0200
Subject: sched/fair: Implement synchonous PELT detach on load-balance migrate

Vincent wondered why his self migrating task had a roughly 50% dip in
load_avg when landing on the new CPU. This is because we uncondionally
take the asynchronous detatch_entity route, which can lead to the
attach on the new CPU still seeing the old CPU's contribution to
tg->load_avg, effectively halving the new CPU's shares.

While in general this is something we have to live with, there is the
special case of runnable migration where we can do better.

Tested-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 10d2000fca2d..92dbcc0fea46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3746,10 +3746,6 @@ void remove_entity_load_avg(struct sched_entity *se)
 	 * Similarly for groups, they will have passed through
 	 * post_init_entity_util_avg() before unregister_sched_fair_group()
 	 * calls this.
-	 *
-	 * XXX in case entity_is_task(se) && task_of(se)->on_rq == MIGRATING
-	 * we could actually get the right time, since we're called with
-	 * rq->lock held, see detach_task().
 	 */
 
 	sync_entity_load_avg(se);
@@ -6292,6 +6288,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	return new_cpu;
 }
 
+static void detach_entity_cfs_rq(struct sched_entity *se);
+
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -6325,14 +6323,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
 		se->vruntime -= min_vruntime;
 	}
 
-	/*
-	 * We are supposed to update the task to "current" time, then its up to date
-	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
-	 * what current time is, so simply throw away the out-of-date time. This
-	 * will result in the wakee task is less decayed, but giving the wakee more
-	 * load sounds not bad.
-	 */
-	remove_entity_load_avg(&p->se);
+	if (p->on_rq == TASK_ON_RQ_MIGRATING) {
+		/*
+		 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
+		 * rq->lock and can modify state directly.
+		 */
+		lockdep_assert_held(&task_rq(p)->lock);
+		detach_entity_cfs_rq(&p->se);
+
+	} else {
+		/*
+		 * We are supposed to update the task to "current" time, then
+		 * its up to date and ready to go to new CPU/cfs_rq. But we
+		 * have difficulty in getting what current time is, so simply
+		 * throw away the out-of-date time. This will result in the
+		 * wakee task is less decayed, but giving the wakee more load
+		 * sounds not bad.
+		 */
+		remove_entity_load_avg(&p->se);
+	}
 
 	/* Tell new CPU we are migrated */
 	p->se.avg.last_update_time = 0;
-- 
cgit v1.3-14-g43fede


From f207934fb79d1af1de1a62b09d56a3a1914172c4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 12 May 2017 14:16:30 +0200
Subject: sched/fair: Align PELT windows between cfs_rq and its se

The PELT _sum values are a saw-tooth function, dropping on the decay
edge and then growing back up again during the window.

When these window-edges are not aligned between cfs_rq and se, we can
have the situation where, for example, on dequeue, the se decays
first.

Its _sum values will be small(er), while the cfs_rq _sum values will
still be on their way up. Because of this, the subtraction:
cfs_rq->avg._sum -= se->avg._sum will result in a positive value. This
will then, once the cfs_rq reaches an edge, translate into its _avg
value jumping up.

This is especially visible with the runnable_load bits, since they get
added/subtracted a lot.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 92dbcc0fea46..954b332cd899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -716,13 +716,8 @@ void init_entity_runnable_average(struct sched_entity *se)
 {
 	struct sched_avg *sa = &se->avg;
 
-	sa->last_update_time = 0;
-	/*
-	 * sched_avg's period_contrib should be strictly less then 1024, so
-	 * we give it 1023 to make sure it is almost a period (1024us), and
-	 * will definitely be update (after enqueue).
-	 */
-	sa->period_contrib = 1023;
+	memset(sa, 0, sizeof(*sa));
+
 	/*
 	 * Tasks are intialized with full load to be seen as heavy tasks until
 	 * they get a chance to stabilize to their real load level.
@@ -731,13 +726,9 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 */
 	if (entity_is_task(se))
 		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
-	sa->runnable_load_sum = sa->load_sum = LOAD_AVG_MAX;
 
-	/*
-	 * At this point, util_avg won't be used in select_task_rq_fair anyway
-	 */
-	sa->util_avg = 0;
-	sa->util_sum = 0;
+	se->runnable_weight = se->load.weight;
+
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -785,7 +776,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
 		} else {
 			sa->util_avg = cap;
 		}
-		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 	}
 
 	if (entity_is_task(se)) {
@@ -3632,7 +3622,34 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
+
+	/*
+	 * When we attach the @se to the @cfs_rq, we must align the decay
+	 * window because without that, really weird and wonderful things can
+	 * happen.
+	 *
+	 * XXX illustrate
+	 */
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
+	se->avg.period_contrib = cfs_rq->avg.period_contrib;
+
+	/*
+	 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
+	 * period_contrib. This isn't strictly correct, but since we're
+	 * entirely outside of the PELT hierarchy, nobody cares if we truncate
+	 * _sum a little.
+	 */
+	se->avg.util_sum = se->avg.util_avg * divider;
+
+	se->avg.load_sum = divider;
+	if (se_weight(se)) {
+		se->avg.load_sum =
+			div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
+	}
+
+	se->avg.runnable_load_sum = se->avg.load_sum;
+
 	enqueue_load_avg(cfs_rq, se);
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
-- 
cgit v1.3-14-g43fede


From 9a2dd585b2c431ec1e5d46a9d9568291c7a534cc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 12 May 2017 14:18:10 +0200
Subject: sched/fair: Implement more accurate async detach

The problem with the overestimate is that it will subtract too big a
value from the load_sum, thereby pushing it down further than it ought
to go. Since runnable_load_avg is not subject to a similar 'force',
this results in the occasional 'runnable_load > load' situation.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 954b332cd899..67c39642a512 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3574,6 +3574,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 
 	if (cfs_rq->removed.nr) {
 		unsigned long r;
+		u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
 
 		raw_spin_lock(&cfs_rq->removed.lock);
 		swap(cfs_rq->removed.util_avg, removed_util);
@@ -3582,17 +3583,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
 		cfs_rq->removed.nr = 0;
 		raw_spin_unlock(&cfs_rq->removed.lock);
 
-		/*
-		 * The LOAD_AVG_MAX for _sum is a slight over-estimate,
-		 * which is safe due to sub_positive() clipping at 0.
-		 */
 		r = removed_load;
 		sub_positive(&sa->load_avg, r);
-		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
+		sub_positive(&sa->load_sum, r * divider);
 
 		r = removed_util;
 		sub_positive(&sa->util_avg, r);
-		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+		sub_positive(&sa->util_sum, r * divider);
 
 		add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
 
-- 
cgit v1.3-14-g43fede


From 2c8e4dce7963d2bae02db95fce2691365630685c Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Thu, 3 Aug 2017 11:13:39 -0400
Subject: sched/fair: Calculate runnable_weight slightly differently

Our runnable_weight currently looks like this

runnable_weight = shares * runnable_load_avg / load_avg

The goal is to scale the runnable weight for the group based on its runnable to
load_avg ratio.  The problem with this is it biases us towards tasks that never
go to sleep.  Tasks that go to sleep are going to have their runnable_load_avg
decayed pretty hard, which will drastically reduce the runnable weight of groups
with interactive tasks.  To solve this imbalance we tweak this slightly, so in
the ideal case it is still the above, but in the interactive case it is

runnable_weight = shares * runnable_weight / load_weight

which will make the weight distribution fairer between interactive and
non-interactive groups.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kernel-team@fb.com
Cc: linux-kernel@vger.kernel.org
Cc: riel@redhat.com
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/1501773219-18774-2-git-send-email-jbacik@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 67c39642a512..a62098ec9deb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2883,7 +2883,7 @@ void reweight_task(struct task_struct *p, int prio)
  *
  * hence icky!
  */
-static long calc_cfs_shares(struct cfs_rq *cfs_rq)
+static long calc_group_shares(struct cfs_rq *cfs_rq)
 {
 	long tg_weight, tg_shares, load, shares;
 	struct task_group *tg = cfs_rq->tg;
@@ -2920,6 +2920,36 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq)
 	 */
 	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
+
+/*
+ * The runnable shares of this group are calculated as such
+ *
+ *          max(cfs_rq->avg.runnable_load_avg, cfs_rq->runnable_weight)
+ * shares * ------------------------------------------------------------
+ *               max(cfs_rq->avg.load_avg, cfs_rq->load.weight)
+ *
+ * We do this to keep the shares in line with expected load on the cfs_rq.
+ * Consider a cfs_rq that has several tasks wake up on this cfs_rq for the first
+ * time, it's runnable_load_avg is not going to be representative of the actual
+ * load this cfs_rq will now experience, which will bias us agaisnt this cfs_rq.
+ * The weight on the cfs_rq is the immediate effect of having new tasks
+ * enqueue'd onto it which should be used to calculate the new runnable shares.
+ * At the same time we need the actual load_avg to be the lower bounds for the
+ * calculation, to handle when our weight drops quickly from having entities
+ * dequeued.
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
+{
+	long load_avg = max(cfs_rq->avg.load_avg,
+			    scale_load_down(cfs_rq->load.weight));
+	long runnable = max(cfs_rq->avg.runnable_load_avg,
+			    scale_load_down(cfs_rq->runnable_weight));
+
+	runnable *= shares;
+	if (load_avg)
+		runnable /= load_avg;
+	return clamp_t(long, runnable, MIN_SHARES, shares);
+}
 # endif /* CONFIG_SMP */
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
@@ -2945,17 +2975,8 @@ static void update_cfs_group(struct sched_entity *se)
 	if (likely(se->load.weight == shares))
 		return;
 #else
-	shares = calc_cfs_shares(gcfs_rq);
-	/*
-	 * The hierarchical runnable load metric is the proportional part
-	 * of this group's runnable_load_avg / load_avg.
-	 *
-	 * Note: we need to deal with very sporadic 'runnable > load' cases
-	 * due to numerical instability.
-	 */
-	runnable = shares * gcfs_rq->avg.runnable_load_avg;
-	if (runnable)
-		runnable /= max(gcfs_rq->avg.load_avg, gcfs_rq->avg.runnable_load_avg);
+	shares   = calc_group_shares(gcfs_rq);
+	runnable = calc_group_runnable(gcfs_rq, shares);
 #endif
 
 	reweight_entity(cfs_rq_of(se), se, shares, runnable);
-- 
cgit v1.3-14-g43fede


From 17de4ee04ca925590df036b112c1db8a778e14bf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 24 Aug 2017 13:06:35 +0200
Subject: sched/fair: Update calc_group_*() comments

I had a wee bit of trouble recalling how the calc_group_runnable()
stuff worked.. add hopefully better comments.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 66 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a62098ec9deb..350dbec01523 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2860,7 +2860,7 @@ void reweight_task(struct task_struct *p, int prio)
  * Now, in that special case (1) reduces to:
  *
  *                     tg->weight * grq->load.weight
- *   ge->load.weight = ----------------------------- = tg>weight   (4)
+ *   ge->load.weight = ----------------------------- = tg->weight   (4)
  *			    grp->load.weight
  *
  * That is, the sum collapses because all other CPUs are idle; the UP scenario.
@@ -2874,6 +2874,18 @@ void reweight_task(struct task_struct *p, int prio)
  *     ---------------------------------------------------         (5)
  *     tg->load_avg - grq->avg.load_avg + grq->load.weight
  *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------		   (6)
+ *				tg_load_avg'
+ *
+ * Where:
+ *
+ *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ *                  max(grq->load.weight, grq->avg.load_avg)
  *
  * And that is shares_weight and is icky. In the (near) UP case it approaches
  * (4) while in the normal case it approaches (3). It consistently
@@ -2890,10 +2902,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
 
 	tg_shares = READ_ONCE(tg->shares);
 
-	/*
-	 * Because (5) drops to 0 when the cfs_rq is idle, we need to use (3)
-	 * as a lower bound.
-	 */
 	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
 	tg_weight = atomic_long_read(&tg->load_avg);
@@ -2922,32 +2930,46 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
 }
 
 /*
- * The runnable shares of this group are calculated as such
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * an load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
  *
- *          max(cfs_rq->avg.runnable_load_avg, cfs_rq->runnable_weight)
- * shares * ------------------------------------------------------------
- *               max(cfs_rq->avg.load_avg, cfs_rq->load.weight)
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
  *
- * We do this to keep the shares in line with expected load on the cfs_rq.
- * Consider a cfs_rq that has several tasks wake up on this cfs_rq for the first
- * time, it's runnable_load_avg is not going to be representative of the actual
- * load this cfs_rq will now experience, which will bias us agaisnt this cfs_rq.
- * The weight on the cfs_rq is the immediate effect of having new tasks
- * enqueue'd onto it which should be used to calculate the new runnable shares.
- * At the same time we need the actual load_avg to be the lower bounds for the
- * calculation, to handle when our weight drops quickly from having entities
- * dequeued.
+ *					     grq->avg.runnable_load_avg
+ *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ *						 grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ *   ge->runnable_weight = ge->load.weight *
+ *
+ *		max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ *		-----------------------------------------------------	(8)
+ *		      max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
  */
 static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
 {
-	long load_avg = max(cfs_rq->avg.load_avg,
-			    scale_load_down(cfs_rq->load.weight));
-	long runnable = max(cfs_rq->avg.runnable_load_avg,
-			    scale_load_down(cfs_rq->runnable_weight));
+	long runnable, load_avg;
+
+	load_avg = max(cfs_rq->avg.load_avg,
+		       scale_load_down(cfs_rq->load.weight));
+
+	runnable = max(cfs_rq->avg.runnable_load_avg,
+		       scale_load_down(cfs_rq->runnable_weight));
 
 	runnable *= shares;
 	if (load_avg)
 		runnable /= load_avg;
+
 	return clamp_t(long, runnable, MIN_SHARES, shares);
 }
 # endif /* CONFIG_SMP */
-- 
cgit v1.3-14-g43fede


From a1f7164c7b8b0d46f63bfb4ca0bb5971c760b921 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 09:00:18 -0700
Subject: sched: Misc preps for cgroup unified hierarchy interface

Make the following changes in preparation for the cpu controller
interface implementation for cgroup2.  This patch doesn't cause any
functional differences.

* s/cpu_stats_show()/cpu_cfs_stat_show()/

* s/cpu_files/cpu_legacy_files/

v2: Dropped cpuacct changes as it won't be used by cpu controller
    interface anymore.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/sched/core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..6815fa424a7a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6599,7 +6599,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -6639,7 +6639,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -6660,7 +6660,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stat_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -6686,7 +6686,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-	.legacy_cftypes	= cpu_files,
+	.legacy_cftypes	= cpu_legacy_files,
 	.early_init	= true,
 };
 
-- 
cgit v1.3-14-g43fede


From 0d5936344f30aba0f6ddb92b030cb6a05168efe6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 25 Sep 2017 09:00:19 -0700
Subject: sched: Implement interface for cgroup unified hierarchy

There are a couple interface issues which can be addressed in cgroup2
interface.

* Stats from cpuacct being reported separately from the cpu stats.

* Use of different time units.  Writable control knobs use
  microseconds, some stat fields use nanoseconds while other cpuacct
  stat fields use centiseconds.

* Control knobs which can't be used in the root cgroup still show up
  in the root.

* Control knob names and semantics aren't consistent with other
  controllers.

This patchset implements cpu controller's interface on cgroup2 which
adheres to the controller file conventions described in
Documentation/cgroups/cgroup-v2.txt.  Overall, the following changes
are made.

* cpuacct is implictly enabled and disabled by cpu and its information
  is reported through "cpu.stat" which now uses microseconds for all
  time durations.  All time duration fields now have "_usec" appended
  to them for clarity.

  Note that cpuacct.usage_percpu is currently not included in
  "cpu.stat".  If this information is actually called for, it will be
  added later.

* "cpu.shares" is replaced with "cpu.weight" and operates on the
  standard scale defined by CGROUP_WEIGHT_MIN/DFL/MAX (1, 100, 10000).
  The weight is scaled to scheduler weight so that 100 maps to 1024
  and the ratio relationship is preserved - if weight is W and its
  scaled value is S, W / 100 == S / 1024.  While the mapped range is a
  bit smaller than the orignal scheduler weight range, the dead zones
  on both sides are relatively small and covers wider range than the
  nice value mappings.  This file doesn't make sense in the root
  cgroup and isn't created on root.

* "cpu.weight.nice" is added. When read, it reads back the nice value
  which is closest to the current "cpu.weight".  When written, it sets
  "cpu.weight" to the weight value which matches the nice value.  This
  makes it easy to configure cgroups when they're competing against
  threads in threaded subtrees.

* "cpu.cfs_quota_us" and "cpu.cfs_period_us" are replaced by "cpu.max"
  which contains both quota and period.

v4: - Use cgroup2 basic usage stat as the information source instead
      of cpuacct.

v3: - Added "cpu.weight.nice" to allow using nice values when
      configuring the weight.  The feature is requested by PeterZ.
    - Merge the patch to enable threaded support on cpu and cpuacct.
    - Dropped the bits about getting rid of cpuacct from patch
      description as there is a pretty strong case for making cpuacct
      an implicit controller so that basic cpu usage stats are always
      available.
    - Documentation updated accordingly.  "cpu.rt.max" section is
      dropped for now.

v2: - cpu_stats_show() was incorrectly using CONFIG_FAIR_GROUP_SCHED
      for CFS bandwidth stats and also using raw division for u64.
      Use CONFIG_CFS_BANDWITH and do_div() instead.  "cpu.rt.max" is
      not included yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 Documentation/cgroup-v2.txt |  36 ++++------
 kernel/sched/core.c         | 171 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 183 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 3f8216912df0..0bbdc720dd7c 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -902,10 +902,6 @@ Controllers
 CPU
 ---
 
-.. note::
-
-	The interface for the cpu controller hasn't been merged yet
-
 The "cpu" controllers regulates distribution of CPU cycles.  This
 controller implements weight and absolute bandwidth limit models for
 normal scheduling policy and absolute bandwidth allocation model for
@@ -935,6 +931,18 @@ All time durations are in microseconds.
 
 	The weight in the range [1, 10000].
 
+  cpu.weight.nice
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "0".
+
+	The nice value is in the range [-20, 19].
+
+	This interface file is an alternative interface for
+	"cpu.weight" and allows reading and setting weight using the
+	same values used by nice(2).  Because the range is smaller and
+	granularity is coarser for the nice values, the read value is
+	the closest approximation of the current weight.
+
   cpu.max
 	A read-write two value file which exists on non-root cgroups.
 	The default is "max 100000".
@@ -947,26 +955,6 @@ All time durations are in microseconds.
 	$PERIOD duration.  "max" for $MAX indicates no limit.  If only
 	one number is written, $MAX is updated.
 
-  cpu.rt.max
-	.. note::
-
-	   The semantics of this file is still under discussion and the
-	   interface hasn't been merged yet
-
-	A read-write two value file which exists on all cgroups.
-	The default is "0 100000".
-
-	The maximum realtime runtime allocation.  Over-committing
-	configurations are disallowed and process migrations are
-	rejected if not enough bandwidth is available.  It's in the
-	following format::
-
-	  $MAX $PERIOD
-
-	which indicates that the group may consume upto $MAX in each
-	$PERIOD duration.  If only one number is written, $MAX is
-	updated.
-
 
 Memory
 ------
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6815fa424a7a..ad255162a830 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6678,6 +6678,175 @@ static struct cftype cpu_legacy_files[] = {
 	{ }	/* Terminate */
 };
 
+static int cpu_stat_show(struct seq_file *sf, void *v)
+{
+	cgroup_stat_show_cputime(sf, "");
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(seq_css(sf));
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cft, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively.  While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+				    struct cftype *cft)
+{
+	unsigned long weight = scale_load_down(css_tg(css)->shares);
+	int last_delta = INT_MAX;
+	int prio, delta;
+
+	/* find the closest nice value to the current weight */
+	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+		delta = abs(sched_prio_to_weight[prio] - weight);
+		if (delta >= last_delta)
+			break;
+		last_delta = delta;
+	}
+
+	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cft, s64 nice)
+{
+	unsigned long weight;
+
+	if (nice < MIN_NICE || nice > MAX_NICE)
+		return -ERANGE;
+
+	weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
+
+	if (!sscanf(buf, "%s %llu", tok, periodp))
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stat_show,
+	},
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+	{
+		.name = "weight.nice",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_weight_nice_read_s64,
+		.write_s64 = cpu_weight_nice_write_s64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_online	= cpu_cgroup_css_online,
@@ -6687,7 +6856,9 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.legacy_cftypes	= cpu_legacy_files,
+	.dfl_cftypes	= cpu_files,
 	.early_init	= true,
+	.threaded	= true,
 };
 
 #endif	/* CONFIG_CGROUP_SCHED */
-- 
cgit v1.3-14-g43fede


From 721e08dad17e226ef68819d0a23dc53c25fe8ea5 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Fri, 29 Sep 2017 10:52:17 -0700
Subject: bpf: Fix compiler warning on info.map_ids for 32bit platform

This patch uses u64_to_user_ptr() to cast info.map_ids to a userspace ptr.
It also tags the user_map_ids with '__user' for sparse check.

Fixes: cb4d2b3f03d8 ("bpf: Add name, load_time, uid and map_ids to bpf_prog_info")
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 11a7f82a55d1..b927da66f653 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1405,7 +1405,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.nr_map_ids = prog->aux->used_map_cnt;
 	ulen = min_t(u32, info.nr_map_ids, ulen);
 	if (ulen) {
-		u32 *user_map_ids = (u32 *)info.map_ids;
+		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
 		u32 i;
 
 		for (i = 0; i < ulen; i++)
-- 
cgit v1.3-14-g43fede


From 7813dd6fc75fb375d4caf002e7f80a826fc3153a Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 26 Sep 2017 15:12:40 -0700
Subject: PM / OPP: Move the OPP directory out of power/

The drivers/base/power/ directory is special and contains code related
to power management core like system suspend/resume, hibernation, etc.
It was fine to keep the OPP code inside it when we had just one file for
it, but it is growing now and already has a directory for itself.

Lets move it directly under drivers/ directory, just like cpufreq and
cpuidle.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS                      |    2 +-
 drivers/Kconfig                  |    2 +
 drivers/Makefile                 |    1 +
 drivers/base/power/Makefile      |    1 -
 drivers/base/power/opp/Makefile  |    4 -
 drivers/base/power/opp/core.c    | 1747 --------------------------------------
 drivers/base/power/opp/cpu.c     |  236 -----
 drivers/base/power/opp/debugfs.c |  249 ------
 drivers/base/power/opp/of.c      |  633 --------------
 drivers/base/power/opp/opp.h     |  222 -----
 drivers/opp/Kconfig              |   13 +
 drivers/opp/Makefile             |    4 +
 drivers/opp/core.c               | 1747 ++++++++++++++++++++++++++++++++++++++
 drivers/opp/cpu.c                |  236 +++++
 drivers/opp/debugfs.c            |  249 ++++++
 drivers/opp/of.c                 |  633 ++++++++++++++
 drivers/opp/opp.h                |  222 +++++
 kernel/power/Kconfig             |   14 -
 18 files changed, 3108 insertions(+), 3107 deletions(-)
 delete mode 100644 drivers/base/power/opp/Makefile
 delete mode 100644 drivers/base/power/opp/core.c
 delete mode 100644 drivers/base/power/opp/cpu.c
 delete mode 100644 drivers/base/power/opp/debugfs.c
 delete mode 100644 drivers/base/power/opp/of.c
 delete mode 100644 drivers/base/power/opp/opp.h
 create mode 100644 drivers/opp/Kconfig
 create mode 100644 drivers/opp/Makefile
 create mode 100644 drivers/opp/core.c
 create mode 100644 drivers/opp/cpu.c
 create mode 100644 drivers/opp/debugfs.c
 create mode 100644 drivers/opp/of.c
 create mode 100644 drivers/opp/opp.h

(limited to 'kernel')

diff --git a/MAINTAINERS b/MAINTAINERS
index 65b0c88d5ee0..7c8c649fc68b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10043,7 +10043,7 @@ M:	Stephen Boyd <sboyd@codeaurora.org>
 L:	linux-pm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/vireshk/pm.git
-F:	drivers/base/power/opp/
+F:	drivers/opp/
 F:	include/linux/pm_opp.h
 F:	Documentation/power/opp.txt
 F:	Documentation/devicetree/bindings/opp/
diff --git a/drivers/Kconfig b/drivers/Kconfig
index 505c676fa9c7..9e264d410c23 100644
--- a/drivers/Kconfig
+++ b/drivers/Kconfig
@@ -208,4 +208,6 @@ source "drivers/tee/Kconfig"
 
 source "drivers/mux/Kconfig"
 
+source "drivers/opp/Kconfig"
+
 endmenu
diff --git a/drivers/Makefile b/drivers/Makefile
index d90fdc413648..dd718a3007e9 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -125,6 +125,7 @@ obj-$(CONFIG_ACCESSIBILITY)	+= accessibility/
 obj-$(CONFIG_ISDN)		+= isdn/
 obj-$(CONFIG_EDAC)		+= edac/
 obj-$(CONFIG_EISA)		+= eisa/
+obj-$(CONFIG_PM_OPP)		+= opp/
 obj-$(CONFIG_CPU_FREQ)		+= cpufreq/
 obj-$(CONFIG_CPU_IDLE)		+= cpuidle/
 obj-y				+= mmc/
diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile
index 5998c53280f5..73a1cffc0a5f 100644
--- a/drivers/base/power/Makefile
+++ b/drivers/base/power/Makefile
@@ -1,7 +1,6 @@
 obj-$(CONFIG_PM)	+= sysfs.o generic_ops.o common.o qos.o runtime.o wakeirq.o
 obj-$(CONFIG_PM_SLEEP)	+= main.o wakeup.o
 obj-$(CONFIG_PM_TRACE_RTC)	+= trace.o
-obj-$(CONFIG_PM_OPP)	+= opp/
 obj-$(CONFIG_PM_GENERIC_DOMAINS)	+=  domain.o domain_governor.o
 obj-$(CONFIG_HAVE_CLK)	+= clock_ops.o
 
diff --git a/drivers/base/power/opp/Makefile b/drivers/base/power/opp/Makefile
deleted file mode 100644
index e70ceb406fe9..000000000000
--- a/drivers/base/power/opp/Makefile
+++ /dev/null
@@ -1,4 +0,0 @@
-ccflags-$(CONFIG_DEBUG_DRIVER)	:= -DDEBUG
-obj-y				+= core.o cpu.o
-obj-$(CONFIG_OF)		+= of.o
-obj-$(CONFIG_DEBUG_FS)		+= debugfs.o
diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
deleted file mode 100644
index a6de32530693..000000000000
--- a/drivers/base/power/opp/core.c
+++ /dev/null
@@ -1,1747 +0,0 @@
-/*
- * Generic OPP Interface
- *
- * Copyright (C) 2009-2010 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/clk.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/export.h>
-#include <linux/regulator/consumer.h>
-
-#include "opp.h"
-
-/*
- * The root of the list of all opp-tables. All opp_table structures branch off
- * from here, with each opp_table containing the list of opps it supports in
- * various states of availability.
- */
-LIST_HEAD(opp_tables);
-/* Lock to allow exclusive modification to the device and opp lists */
-DEFINE_MUTEX(opp_table_lock);
-
-static void dev_pm_opp_get(struct dev_pm_opp *opp);
-
-static struct opp_device *_find_opp_dev(const struct device *dev,
-					struct opp_table *opp_table)
-{
-	struct opp_device *opp_dev;
-
-	list_for_each_entry(opp_dev, &opp_table->dev_list, node)
-		if (opp_dev->dev == dev)
-			return opp_dev;
-
-	return NULL;
-}
-
-static struct opp_table *_find_opp_table_unlocked(struct device *dev)
-{
-	struct opp_table *opp_table;
-
-	list_for_each_entry(opp_table, &opp_tables, node) {
-		if (_find_opp_dev(dev, opp_table)) {
-			_get_opp_table_kref(opp_table);
-
-			return opp_table;
-		}
-	}
-
-	return ERR_PTR(-ENODEV);
-}
-
-/**
- * _find_opp_table() - find opp_table struct using device pointer
- * @dev:	device pointer used to lookup OPP table
- *
- * Search OPP table for one containing matching device.
- *
- * Return: pointer to 'struct opp_table' if found, otherwise -ENODEV or
- * -EINVAL based on type of error.
- *
- * The callers must call dev_pm_opp_put_opp_table() after the table is used.
- */
-struct opp_table *_find_opp_table(struct device *dev)
-{
-	struct opp_table *opp_table;
-
-	if (IS_ERR_OR_NULL(dev)) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return ERR_PTR(-EINVAL);
-	}
-
-	mutex_lock(&opp_table_lock);
-	opp_table = _find_opp_table_unlocked(dev);
-	mutex_unlock(&opp_table_lock);
-
-	return opp_table;
-}
-
-/**
- * dev_pm_opp_get_voltage() - Gets the voltage corresponding to an opp
- * @opp:	opp for which voltage has to be returned for
- *
- * Return: voltage in micro volt corresponding to the opp, else
- * return 0
- *
- * This is useful only for devices with single power supply.
- */
-unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
-{
-	if (IS_ERR_OR_NULL(opp)) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return 0;
-	}
-
-	return opp->supplies[0].u_volt;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
-
-/**
- * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
- * @opp:	opp for which frequency has to be returned for
- *
- * Return: frequency in hertz corresponding to the opp, else
- * return 0
- */
-unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
-{
-	if (IS_ERR_OR_NULL(opp) || !opp->available) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return 0;
-	}
-
-	return opp->rate;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
-
-/**
- * dev_pm_opp_is_turbo() - Returns if opp is turbo OPP or not
- * @opp: opp for which turbo mode is being verified
- *
- * Turbo OPPs are not for normal use, and can be enabled (under certain
- * conditions) for short duration of times to finish high throughput work
- * quickly. Running on them for longer times may overheat the chip.
- *
- * Return: true if opp is turbo opp, else false.
- */
-bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
-{
-	if (IS_ERR_OR_NULL(opp) || !opp->available) {
-		pr_err("%s: Invalid parameters\n", __func__);
-		return false;
-	}
-
-	return opp->turbo;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo);
-
-/**
- * dev_pm_opp_get_max_clock_latency() - Get max clock latency in nanoseconds
- * @dev:	device for which we do this operation
- *
- * Return: This function returns the max clock latency in nanoseconds.
- */
-unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
-{
-	struct opp_table *opp_table;
-	unsigned long clock_latency_ns;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return 0;
-
-	clock_latency_ns = opp_table->clock_latency_ns_max;
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return clock_latency_ns;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
-
-/**
- * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
- * @dev: device for which we do this operation
- *
- * Return: This function returns the max voltage latency in nanoseconds.
- */
-unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *opp;
-	struct regulator *reg;
-	unsigned long latency_ns = 0;
-	int ret, i, count;
-	struct {
-		unsigned long min;
-		unsigned long max;
-	} *uV;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return 0;
-
-	count = opp_table->regulator_count;
-
-	/* Regulator may not be required for the device */
-	if (!count)
-		goto put_opp_table;
-
-	uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL);
-	if (!uV)
-		goto put_opp_table;
-
-	mutex_lock(&opp_table->lock);
-
-	for (i = 0; i < count; i++) {
-		uV[i].min = ~0;
-		uV[i].max = 0;
-
-		list_for_each_entry(opp, &opp_table->opp_list, node) {
-			if (!opp->available)
-				continue;
-
-			if (opp->supplies[i].u_volt_min < uV[i].min)
-				uV[i].min = opp->supplies[i].u_volt_min;
-			if (opp->supplies[i].u_volt_max > uV[i].max)
-				uV[i].max = opp->supplies[i].u_volt_max;
-		}
-	}
-
-	mutex_unlock(&opp_table->lock);
-
-	/*
-	 * The caller needs to ensure that opp_table (and hence the regulator)
-	 * isn't freed, while we are executing this routine.
-	 */
-	for (i = 0; i < count; i++) {
-		reg = opp_table->regulators[i];
-		ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max);
-		if (ret > 0)
-			latency_ns += ret * 1000;
-	}
-
-	kfree(uV);
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return latency_ns;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency);
-
-/**
- * dev_pm_opp_get_max_transition_latency() - Get max transition latency in
- *					     nanoseconds
- * @dev: device for which we do this operation
- *
- * Return: This function returns the max transition latency, in nanoseconds, to
- * switch from one OPP to other.
- */
-unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
-{
-	return dev_pm_opp_get_max_volt_latency(dev) +
-		dev_pm_opp_get_max_clock_latency(dev);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency);
-
-/**
- * dev_pm_opp_get_suspend_opp_freq() - Get frequency of suspend opp in Hz
- * @dev:	device for which we do this operation
- *
- * Return: This function returns the frequency of the OPP marked as suspend_opp
- * if one is available, else returns 0;
- */
-unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev)
-{
-	struct opp_table *opp_table;
-	unsigned long freq = 0;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return 0;
-
-	if (opp_table->suspend_opp && opp_table->suspend_opp->available)
-		freq = dev_pm_opp_get_freq(opp_table->suspend_opp);
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return freq;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp_freq);
-
-/**
- * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table
- * @dev:	device for which we do this operation
- *
- * Return: This function returns the number of available opps if there are any,
- * else returns 0 if none or the corresponding error value.
- */
-int dev_pm_opp_get_opp_count(struct device *dev)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *temp_opp;
-	int count = 0;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		count = PTR_ERR(opp_table);
-		dev_err(dev, "%s: OPP table not found (%d)\n",
-			__func__, count);
-		return count;
-	}
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available)
-			count++;
-	}
-
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return count;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
-
-/**
- * dev_pm_opp_find_freq_exact() - search for an exact frequency
- * @dev:		device for which we do this operation
- * @freq:		frequency to search for
- * @available:		true/false - match for available opp
- *
- * Return: Searches for exact match in the opp table and returns pointer to the
- * matching opp if found, else returns ERR_PTR in case of error and should
- * be handled using IS_ERR. Error return values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * Note: available is a modifier for the search. if available=true, then the
- * match is for exact matching frequency and is available in the stored OPP
- * table. if false, the match is for exact frequency which is not available.
- *
- * This provides a mechanism to enable an opp which is not available currently
- * or the opposite as well.
- *
- * The callers are required to call dev_pm_opp_put() for the returned OPP after
- * use.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
-					      unsigned long freq,
-					      bool available)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		int r = PTR_ERR(opp_table);
-
-		dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r);
-		return ERR_PTR(r);
-	}
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available == available &&
-				temp_opp->rate == freq) {
-			opp = temp_opp;
-
-			/* Increment the reference count of OPP */
-			dev_pm_opp_get(opp);
-			break;
-		}
-	}
-
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
-
-static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table,
-						   unsigned long *freq)
-{
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available && temp_opp->rate >= *freq) {
-			opp = temp_opp;
-			*freq = opp->rate;
-
-			/* Increment the reference count of OPP */
-			dev_pm_opp_get(opp);
-			break;
-		}
-	}
-
-	mutex_unlock(&opp_table->lock);
-
-	return opp;
-}
-
-/**
- * dev_pm_opp_find_freq_ceil() - Search for an rounded ceil freq
- * @dev:	device for which we do this operation
- * @freq:	Start frequency
- *
- * Search for the matching ceil *available* OPP from a starting freq
- * for a device.
- *
- * Return: matching *opp and refreshes *freq accordingly, else returns
- * ERR_PTR in case of error and should be handled using IS_ERR. Error return
- * values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * The callers are required to call dev_pm_opp_put() for the returned OPP after
- * use.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
-					     unsigned long *freq)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *opp;
-
-	if (!dev || !freq) {
-		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
-		return ERR_PTR(-EINVAL);
-	}
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return ERR_CAST(opp_table);
-
-	opp = _find_freq_ceil(opp_table, freq);
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
-
-/**
- * dev_pm_opp_find_freq_floor() - Search for a rounded floor freq
- * @dev:	device for which we do this operation
- * @freq:	Start frequency
- *
- * Search for the matching floor *available* OPP from a starting freq
- * for a device.
- *
- * Return: matching *opp and refreshes *freq accordingly, else returns
- * ERR_PTR in case of error and should be handled using IS_ERR. Error return
- * values can be:
- * EINVAL:	for bad pointer
- * ERANGE:	no match found for search
- * ENODEV:	if device not found in list of registered devices
- *
- * The callers are required to call dev_pm_opp_put() for the returned OPP after
- * use.
- */
-struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
-					      unsigned long *freq)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
-
-	if (!dev || !freq) {
-		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
-		return ERR_PTR(-EINVAL);
-	}
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return ERR_CAST(opp_table);
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
-		if (temp_opp->available) {
-			/* go to the next node, before choosing prev */
-			if (temp_opp->rate > *freq)
-				break;
-			else
-				opp = temp_opp;
-		}
-	}
-
-	/* Increment the reference count of OPP */
-	if (!IS_ERR(opp))
-		dev_pm_opp_get(opp);
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-
-	if (!IS_ERR(opp))
-		*freq = opp->rate;
-
-	return opp;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
-
-static int _set_opp_voltage(struct device *dev, struct regulator *reg,
-			    struct dev_pm_opp_supply *supply)
-{
-	int ret;
-
-	/* Regulator not available for device */
-	if (IS_ERR(reg)) {
-		dev_dbg(dev, "%s: regulator not available: %ld\n", __func__,
-			PTR_ERR(reg));
-		return 0;
-	}
-
-	dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__,
-		supply->u_volt_min, supply->u_volt, supply->u_volt_max);
-
-	ret = regulator_set_voltage_triplet(reg, supply->u_volt_min,
-					    supply->u_volt, supply->u_volt_max);
-	if (ret)
-		dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
-			__func__, supply->u_volt_min, supply->u_volt,
-			supply->u_volt_max, ret);
-
-	return ret;
-}
-
-static inline int
-_generic_set_opp_clk_only(struct device *dev, struct clk *clk,
-			  unsigned long old_freq, unsigned long freq)
-{
-	int ret;
-
-	ret = clk_set_rate(clk, freq);
-	if (ret) {
-		dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
-			ret);
-	}
-
-	return ret;
-}
-
-static int _generic_set_opp_regulator(const struct opp_table *opp_table,
-				      struct device *dev,
-				      unsigned long old_freq,
-				      unsigned long freq,
-				      struct dev_pm_opp_supply *old_supply,
-				      struct dev_pm_opp_supply *new_supply)
-{
-	struct regulator *reg = opp_table->regulators[0];
-	int ret;
-
-	/* This function only supports single regulator per device */
-	if (WARN_ON(opp_table->regulator_count > 1)) {
-		dev_err(dev, "multiple regulators are not supported\n");
-		return -EINVAL;
-	}
-
-	/* Scaling up? Scale voltage before frequency */
-	if (freq > old_freq) {
-		ret = _set_opp_voltage(dev, reg, new_supply);
-		if (ret)
-			goto restore_voltage;
-	}
-
-	/* Change frequency */
-	ret = _generic_set_opp_clk_only(dev, opp_table->clk, old_freq, freq);
-	if (ret)
-		goto restore_voltage;
-
-	/* Scaling down? Scale voltage after frequency */
-	if (freq < old_freq) {
-		ret = _set_opp_voltage(dev, reg, new_supply);
-		if (ret)
-			goto restore_freq;
-	}
-
-	return 0;
-
-restore_freq:
-	if (_generic_set_opp_clk_only(dev, opp_table->clk, freq, old_freq))
-		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
-			__func__, old_freq);
-restore_voltage:
-	/* This shouldn't harm even if the voltages weren't updated earlier */
-	if (old_supply)
-		_set_opp_voltage(dev, reg, old_supply);
-
-	return ret;
-}
-
-/**
- * dev_pm_opp_set_rate() - Configure new OPP based on frequency
- * @dev:	 device for which we do this operation
- * @target_freq: frequency to achieve
- *
- * This configures the power-supplies and clock source to the levels specified
- * by the OPP corresponding to the target_freq.
- */
-int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
-{
-	struct opp_table *opp_table;
-	unsigned long freq, old_freq;
-	struct dev_pm_opp *old_opp, *opp;
-	struct clk *clk;
-	int ret, size;
-
-	if (unlikely(!target_freq)) {
-		dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
-			target_freq);
-		return -EINVAL;
-	}
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		dev_err(dev, "%s: device opp doesn't exist\n", __func__);
-		return PTR_ERR(opp_table);
-	}
-
-	clk = opp_table->clk;
-	if (IS_ERR(clk)) {
-		dev_err(dev, "%s: No clock available for the device\n",
-			__func__);
-		ret = PTR_ERR(clk);
-		goto put_opp_table;
-	}
-
-	freq = clk_round_rate(clk, target_freq);
-	if ((long)freq <= 0)
-		freq = target_freq;
-
-	old_freq = clk_get_rate(clk);
-
-	/* Return early if nothing to do */
-	if (old_freq == freq) {
-		dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n",
-			__func__, freq);
-		ret = 0;
-		goto put_opp_table;
-	}
-
-	old_opp = _find_freq_ceil(opp_table, &old_freq);
-	if (IS_ERR(old_opp)) {
-		dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n",
-			__func__, old_freq, PTR_ERR(old_opp));
-	}
-
-	opp = _find_freq_ceil(opp_table, &freq);
-	if (IS_ERR(opp)) {
-		ret = PTR_ERR(opp);
-		dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n",
-			__func__, freq, ret);
-		goto put_old_opp;
-	}
-
-	dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__,
-		old_freq, freq);
-
-	/* Only frequency scaling */
-	if (!opp_table->regulators) {
-		ret = _generic_set_opp_clk_only(dev, clk, old_freq, freq);
-	} else if (!opp_table->set_opp) {
-		ret = _generic_set_opp_regulator(opp_table, dev, old_freq, freq,
-						 IS_ERR(old_opp) ? NULL : old_opp->supplies,
-						 opp->supplies);
-	} else {
-		struct dev_pm_set_opp_data *data;
-
-		data = opp_table->set_opp_data;
-		data->regulators = opp_table->regulators;
-		data->regulator_count = opp_table->regulator_count;
-		data->clk = clk;
-		data->dev = dev;
-
-		data->old_opp.rate = old_freq;
-		size = sizeof(*opp->supplies) * opp_table->regulator_count;
-		if (IS_ERR(old_opp))
-			memset(data->old_opp.supplies, 0, size);
-		else
-			memcpy(data->old_opp.supplies, old_opp->supplies, size);
-
-		data->new_opp.rate = freq;
-		memcpy(data->new_opp.supplies, opp->supplies, size);
-
-		ret = opp_table->set_opp(data);
-	}
-
-	dev_pm_opp_put(opp);
-put_old_opp:
-	if (!IS_ERR(old_opp))
-		dev_pm_opp_put(old_opp);
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
-
-/* OPP-dev Helpers */
-static void _remove_opp_dev(struct opp_device *opp_dev,
-			    struct opp_table *opp_table)
-{
-	opp_debug_unregister(opp_dev, opp_table);
-	list_del(&opp_dev->node);
-	kfree(opp_dev);
-}
-
-struct opp_device *_add_opp_dev(const struct device *dev,
-				struct opp_table *opp_table)
-{
-	struct opp_device *opp_dev;
-	int ret;
-
-	opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL);
-	if (!opp_dev)
-		return NULL;
-
-	/* Initialize opp-dev */
-	opp_dev->dev = dev;
-	list_add(&opp_dev->node, &opp_table->dev_list);
-
-	/* Create debugfs entries for the opp_table */
-	ret = opp_debug_register(opp_dev, opp_table);
-	if (ret)
-		dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
-			__func__, ret);
-
-	return opp_dev;
-}
-
-static struct opp_table *_allocate_opp_table(struct device *dev)
-{
-	struct opp_table *opp_table;
-	struct opp_device *opp_dev;
-	int ret;
-
-	/*
-	 * Allocate a new OPP table. In the infrequent case where a new
-	 * device is needed to be added, we pay this penalty.
-	 */
-	opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL);
-	if (!opp_table)
-		return NULL;
-
-	INIT_LIST_HEAD(&opp_table->dev_list);
-
-	opp_dev = _add_opp_dev(dev, opp_table);
-	if (!opp_dev) {
-		kfree(opp_table);
-		return NULL;
-	}
-
-	_of_init_opp_table(opp_table, dev);
-
-	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, NULL);
-	if (IS_ERR(opp_table->clk)) {
-		ret = PTR_ERR(opp_table->clk);
-		if (ret != -EPROBE_DEFER)
-			dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__,
-				ret);
-	}
-
-	BLOCKING_INIT_NOTIFIER_HEAD(&opp_table->head);
-	INIT_LIST_HEAD(&opp_table->opp_list);
-	mutex_init(&opp_table->lock);
-	kref_init(&opp_table->kref);
-
-	/* Secure the device table modification */
-	list_add(&opp_table->node, &opp_tables);
-	return opp_table;
-}
-
-void _get_opp_table_kref(struct opp_table *opp_table)
-{
-	kref_get(&opp_table->kref);
-}
-
-struct opp_table *dev_pm_opp_get_opp_table(struct device *dev)
-{
-	struct opp_table *opp_table;
-
-	/* Hold our table modification lock here */
-	mutex_lock(&opp_table_lock);
-
-	opp_table = _find_opp_table_unlocked(dev);
-	if (!IS_ERR(opp_table))
-		goto unlock;
-
-	opp_table = _allocate_opp_table(dev);
-
-unlock:
-	mutex_unlock(&opp_table_lock);
-
-	return opp_table;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_table);
-
-static void _opp_table_kref_release(struct kref *kref)
-{
-	struct opp_table *opp_table = container_of(kref, struct opp_table, kref);
-	struct opp_device *opp_dev;
-
-	/* Release clk */
-	if (!IS_ERR(opp_table->clk))
-		clk_put(opp_table->clk);
-
-	opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device,
-				   node);
-
-	_remove_opp_dev(opp_dev, opp_table);
-
-	/* dev_list must be empty now */
-	WARN_ON(!list_empty(&opp_table->dev_list));
-
-	mutex_destroy(&opp_table->lock);
-	list_del(&opp_table->node);
-	kfree(opp_table);
-
-	mutex_unlock(&opp_table_lock);
-}
-
-void dev_pm_opp_put_opp_table(struct opp_table *opp_table)
-{
-	kref_put_mutex(&opp_table->kref, _opp_table_kref_release,
-		       &opp_table_lock);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_opp_table);
-
-void _opp_free(struct dev_pm_opp *opp)
-{
-	kfree(opp);
-}
-
-static void _opp_kref_release(struct kref *kref)
-{
-	struct dev_pm_opp *opp = container_of(kref, struct dev_pm_opp, kref);
-	struct opp_table *opp_table = opp->opp_table;
-
-	/*
-	 * Notify the changes in the availability of the operable
-	 * frequency/voltage list.
-	 */
-	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_REMOVE, opp);
-	opp_debug_remove_one(opp);
-	list_del(&opp->node);
-	kfree(opp);
-
-	mutex_unlock(&opp_table->lock);
-	dev_pm_opp_put_opp_table(opp_table);
-}
-
-static void dev_pm_opp_get(struct dev_pm_opp *opp)
-{
-	kref_get(&opp->kref);
-}
-
-void dev_pm_opp_put(struct dev_pm_opp *opp)
-{
-	kref_put_mutex(&opp->kref, _opp_kref_release, &opp->opp_table->lock);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put);
-
-/**
- * dev_pm_opp_remove()  - Remove an OPP from OPP table
- * @dev:	device for which we do this operation
- * @freq:	OPP to remove with matching 'freq'
- *
- * This function removes an opp from the opp table.
- */
-void dev_pm_opp_remove(struct device *dev, unsigned long freq)
-{
-	struct dev_pm_opp *opp;
-	struct opp_table *opp_table;
-	bool found = false;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return;
-
-	mutex_lock(&opp_table->lock);
-
-	list_for_each_entry(opp, &opp_table->opp_list, node) {
-		if (opp->rate == freq) {
-			found = true;
-			break;
-		}
-	}
-
-	mutex_unlock(&opp_table->lock);
-
-	if (found) {
-		dev_pm_opp_put(opp);
-	} else {
-		dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n",
-			 __func__, freq);
-	}
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
-
-struct dev_pm_opp *_opp_allocate(struct opp_table *table)
-{
-	struct dev_pm_opp *opp;
-	int count, supply_size;
-
-	/* Allocate space for at least one supply */
-	count = table->regulator_count ? table->regulator_count : 1;
-	supply_size = sizeof(*opp->supplies) * count;
-
-	/* allocate new OPP node and supplies structures */
-	opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL);
-	if (!opp)
-		return NULL;
-
-	/* Put the supplies at the end of the OPP structure as an empty array */
-	opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
-	INIT_LIST_HEAD(&opp->node);
-
-	return opp;
-}
-
-static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
-					 struct opp_table *opp_table)
-{
-	struct regulator *reg;
-	int i;
-
-	for (i = 0; i < opp_table->regulator_count; i++) {
-		reg = opp_table->regulators[i];
-
-		if (!regulator_is_supported_voltage(reg,
-					opp->supplies[i].u_volt_min,
-					opp->supplies[i].u_volt_max)) {
-			pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
-				__func__, opp->supplies[i].u_volt_min,
-				opp->supplies[i].u_volt_max);
-			return false;
-		}
-	}
-
-	return true;
-}
-
-/*
- * Returns:
- * 0: On success. And appropriate error message for duplicate OPPs.
- * -EBUSY: For OPP with same freq/volt and is available. The callers of
- *  _opp_add() must return 0 if they receive -EBUSY from it. This is to make
- *  sure we don't print error messages unnecessarily if different parts of
- *  kernel try to initialize the OPP table.
- * -EEXIST: For OPP with same freq but different volt or is unavailable. This
- *  should be considered an error by the callers of _opp_add().
- */
-int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
-	     struct opp_table *opp_table)
-{
-	struct dev_pm_opp *opp;
-	struct list_head *head;
-	int ret;
-
-	/*
-	 * Insert new OPP in order of increasing frequency and discard if
-	 * already present.
-	 *
-	 * Need to use &opp_table->opp_list in the condition part of the 'for'
-	 * loop, don't replace it with head otherwise it will become an infinite
-	 * loop.
-	 */
-	mutex_lock(&opp_table->lock);
-	head = &opp_table->opp_list;
-
-	list_for_each_entry(opp, &opp_table->opp_list, node) {
-		if (new_opp->rate > opp->rate) {
-			head = &opp->node;
-			continue;
-		}
-
-		if (new_opp->rate < opp->rate)
-			break;
-
-		/* Duplicate OPPs */
-		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
-			 __func__, opp->rate, opp->supplies[0].u_volt,
-			 opp->available, new_opp->rate,
-			 new_opp->supplies[0].u_volt, new_opp->available);
-
-		/* Should we compare voltages for all regulators here ? */
-		ret = opp->available &&
-		      new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? -EBUSY : -EEXIST;
-
-		mutex_unlock(&opp_table->lock);
-		return ret;
-	}
-
-	list_add(&new_opp->node, head);
-	mutex_unlock(&opp_table->lock);
-
-	new_opp->opp_table = opp_table;
-	kref_init(&new_opp->kref);
-
-	/* Get a reference to the OPP table */
-	_get_opp_table_kref(opp_table);
-
-	ret = opp_debug_create_one(new_opp, opp_table);
-	if (ret)
-		dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
-			__func__, ret);
-
-	if (!_opp_supported_by_regulators(new_opp, opp_table)) {
-		new_opp->available = false;
-		dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
-			 __func__, new_opp->rate);
-	}
-
-	return 0;
-}
-
-/**
- * _opp_add_v1() - Allocate a OPP based on v1 bindings.
- * @opp_table:	OPP table
- * @dev:	device for which we do this operation
- * @freq:	Frequency in Hz for this OPP
- * @u_volt:	Voltage in uVolts for this OPP
- * @dynamic:	Dynamically added OPPs.
- *
- * This function adds an opp definition to the opp table and returns status.
- * The opp is made available by default and it can be controlled using
- * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
- *
- * NOTE: "dynamic" parameter impacts OPPs added by the dev_pm_opp_of_add_table
- * and freed by dev_pm_opp_of_remove_table.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- */
-int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
-		unsigned long freq, long u_volt, bool dynamic)
-{
-	struct dev_pm_opp *new_opp;
-	unsigned long tol;
-	int ret;
-
-	new_opp = _opp_allocate(opp_table);
-	if (!new_opp)
-		return -ENOMEM;
-
-	/* populate the opp table */
-	new_opp->rate = freq;
-	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
-	new_opp->supplies[0].u_volt = u_volt;
-	new_opp->supplies[0].u_volt_min = u_volt - tol;
-	new_opp->supplies[0].u_volt_max = u_volt + tol;
-	new_opp->available = true;
-	new_opp->dynamic = dynamic;
-
-	ret = _opp_add(dev, new_opp, opp_table);
-	if (ret) {
-		/* Don't return error for duplicate OPPs */
-		if (ret == -EBUSY)
-			ret = 0;
-		goto free_opp;
-	}
-
-	/*
-	 * Notify the changes in the availability of the operable
-	 * frequency/voltage list.
-	 */
-	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp);
-	return 0;
-
-free_opp:
-	_opp_free(new_opp);
-
-	return ret;
-}
-
-/**
- * dev_pm_opp_set_supported_hw() - Set supported platforms
- * @dev: Device for which supported-hw has to be set.
- * @versions: Array of hierarchy of versions to match.
- * @count: Number of elements in the array.
- *
- * This is required only for the V2 bindings, and it enables a platform to
- * specify the hierarchy of versions it supports. OPP layer will then enable
- * OPPs, which are available for those versions, based on its 'opp-supported-hw'
- * property.
- */
-struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
-			const u32 *versions, unsigned int count)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	/* Do we already have a version hierarchy associated with opp_table? */
-	if (opp_table->supported_hw) {
-		dev_err(dev, "%s: Already have supported hardware list\n",
-			__func__);
-		ret = -EBUSY;
-		goto err;
-	}
-
-	opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
-					GFP_KERNEL);
-	if (!opp_table->supported_hw) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	opp_table->supported_hw_count = count;
-
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
-
-/**
- * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
- * @opp_table: OPP table returned by dev_pm_opp_set_supported_hw().
- *
- * This is required only for the V2 bindings, and is called for a matching
- * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
- * will not be freed.
- */
-void dev_pm_opp_put_supported_hw(struct opp_table *opp_table)
-{
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	if (!opp_table->supported_hw) {
-		pr_err("%s: Doesn't have supported hardware list\n",
-		       __func__);
-		return;
-	}
-
-	kfree(opp_table->supported_hw);
-	opp_table->supported_hw = NULL;
-	opp_table->supported_hw_count = 0;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
-
-/**
- * dev_pm_opp_set_prop_name() - Set prop-extn name
- * @dev: Device for which the prop-name has to be set.
- * @name: name to postfix to properties.
- *
- * This is required only for the V2 bindings, and it enables a platform to
- * specify the extn to be used for certain property names. The properties to
- * which the extension will apply are opp-microvolt and opp-microamp. OPP core
- * should postfix the property name with -<name> while looking for them.
- */
-struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	/* Do we already have a prop-name associated with opp_table? */
-	if (opp_table->prop_name) {
-		dev_err(dev, "%s: Already have prop-name %s\n", __func__,
-			opp_table->prop_name);
-		ret = -EBUSY;
-		goto err;
-	}
-
-	opp_table->prop_name = kstrdup(name, GFP_KERNEL);
-	if (!opp_table->prop_name) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
-
-/**
- * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
- * @opp_table: OPP table returned by dev_pm_opp_set_prop_name().
- *
- * This is required only for the V2 bindings, and is called for a matching
- * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
- * will not be freed.
- */
-void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
-{
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	if (!opp_table->prop_name) {
-		pr_err("%s: Doesn't have a prop-name\n", __func__);
-		return;
-	}
-
-	kfree(opp_table->prop_name);
-	opp_table->prop_name = NULL;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
-
-static int _allocate_set_opp_data(struct opp_table *opp_table)
-{
-	struct dev_pm_set_opp_data *data;
-	int len, count = opp_table->regulator_count;
-
-	if (WARN_ON(!count))
-		return -EINVAL;
-
-	/* space for set_opp_data */
-	len = sizeof(*data);
-
-	/* space for old_opp.supplies and new_opp.supplies */
-	len += 2 * sizeof(struct dev_pm_opp_supply) * count;
-
-	data = kzalloc(len, GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	data->old_opp.supplies = (void *)(data + 1);
-	data->new_opp.supplies = data->old_opp.supplies + count;
-
-	opp_table->set_opp_data = data;
-
-	return 0;
-}
-
-static void _free_set_opp_data(struct opp_table *opp_table)
-{
-	kfree(opp_table->set_opp_data);
-	opp_table->set_opp_data = NULL;
-}
-
-/**
- * dev_pm_opp_set_regulators() - Set regulator names for the device
- * @dev: Device for which regulator name is being set.
- * @names: Array of pointers to the names of the regulator.
- * @count: Number of regulators.
- *
- * In order to support OPP switching, OPP layer needs to know the name of the
- * device's regulators, as the core would be required to switch voltages as
- * well.
- *
- * This must be called before any OPPs are initialized for the device.
- */
-struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
-					    const char * const names[],
-					    unsigned int count)
-{
-	struct opp_table *opp_table;
-	struct regulator *reg;
-	int ret, i;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	/* Already have regulators set */
-	if (opp_table->regulators) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	opp_table->regulators = kmalloc_array(count,
-					      sizeof(*opp_table->regulators),
-					      GFP_KERNEL);
-	if (!opp_table->regulators) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	for (i = 0; i < count; i++) {
-		reg = regulator_get_optional(dev, names[i]);
-		if (IS_ERR(reg)) {
-			ret = PTR_ERR(reg);
-			if (ret != -EPROBE_DEFER)
-				dev_err(dev, "%s: no regulator (%s) found: %d\n",
-					__func__, names[i], ret);
-			goto free_regulators;
-		}
-
-		opp_table->regulators[i] = reg;
-	}
-
-	opp_table->regulator_count = count;
-
-	/* Allocate block only once to pass to set_opp() routines */
-	ret = _allocate_set_opp_data(opp_table);
-	if (ret)
-		goto free_regulators;
-
-	return opp_table;
-
-free_regulators:
-	while (i != 0)
-		regulator_put(opp_table->regulators[--i]);
-
-	kfree(opp_table->regulators);
-	opp_table->regulators = NULL;
-	opp_table->regulator_count = 0;
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
-
-/**
- * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
- * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
- */
-void dev_pm_opp_put_regulators(struct opp_table *opp_table)
-{
-	int i;
-
-	if (!opp_table->regulators) {
-		pr_err("%s: Doesn't have regulators set\n", __func__);
-		return;
-	}
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	for (i = opp_table->regulator_count - 1; i >= 0; i--)
-		regulator_put(opp_table->regulators[i]);
-
-	_free_set_opp_data(opp_table);
-
-	kfree(opp_table->regulators);
-	opp_table->regulators = NULL;
-	opp_table->regulator_count = 0;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
-
-/**
- * dev_pm_opp_set_clkname() - Set clk name for the device
- * @dev: Device for which clk name is being set.
- * @name: Clk name.
- *
- * In order to support OPP switching, OPP layer needs to get pointer to the
- * clock for the device. Simple cases work fine without using this routine (i.e.
- * by passing connection-id as NULL), but for a device with multiple clocks
- * available, the OPP core needs to know the exact name of the clk to use.
- *
- * This must be called before any OPPs are initialized for the device.
- */
-struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	/* Already have default clk set, free it */
-	if (!IS_ERR(opp_table->clk))
-		clk_put(opp_table->clk);
-
-	/* Find clk for the device */
-	opp_table->clk = clk_get(dev, name);
-	if (IS_ERR(opp_table->clk)) {
-		ret = PTR_ERR(opp_table->clk);
-		if (ret != -EPROBE_DEFER) {
-			dev_err(dev, "%s: Couldn't find clock: %d\n", __func__,
-				ret);
-		}
-		goto err;
-	}
-
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
-
-/**
- * dev_pm_opp_put_clkname() - Releases resources blocked for clk.
- * @opp_table: OPP table returned from dev_pm_opp_set_clkname().
- */
-void dev_pm_opp_put_clkname(struct opp_table *opp_table)
-{
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	clk_put(opp_table->clk);
-	opp_table->clk = ERR_PTR(-EINVAL);
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
-
-/**
- * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
- * @dev: Device for which the helper is getting registered.
- * @set_opp: Custom set OPP helper.
- *
- * This is useful to support complex platforms (like platforms with multiple
- * regulators per device), instead of the generic OPP set rate helper.
- *
- * This must be called before any OPPs are initialized for the device.
- */
-struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
-			int (*set_opp)(struct dev_pm_set_opp_data *data))
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	if (!set_opp)
-		return ERR_PTR(-EINVAL);
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return ERR_PTR(-ENOMEM);
-
-	/* This should be called before OPPs are initialized */
-	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	/* Already have custom set_opp helper */
-	if (WARN_ON(opp_table->set_opp)) {
-		ret = -EBUSY;
-		goto err;
-	}
-
-	opp_table->set_opp = set_opp;
-
-	return opp_table;
-
-err:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ERR_PTR(ret);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
-
-/**
- * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for
- *					   set_opp helper
- * @opp_table: OPP table returned from dev_pm_opp_register_set_opp_helper().
- *
- * Release resources blocked for platform specific set_opp helper.
- */
-void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table)
-{
-	if (!opp_table->set_opp) {
-		pr_err("%s: Doesn't have custom set_opp helper set\n",
-		       __func__);
-		return;
-	}
-
-	/* Make sure there are no concurrent readers while updating opp_table */
-	WARN_ON(!list_empty(&opp_table->opp_list));
-
-	opp_table->set_opp = NULL;
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper);
-
-/**
- * dev_pm_opp_add()  - Add an OPP table from a table definitions
- * @dev:	device for which we do this operation
- * @freq:	Frequency in Hz for this OPP
- * @u_volt:	Voltage in uVolts for this OPP
- *
- * This function adds an opp definition to the opp table and returns status.
- * The opp is made available by default and it can be controlled using
- * dev_pm_opp_enable/disable functions.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- */
-int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return -ENOMEM;
-
-	ret = _opp_add_v1(opp_table, dev, freq, u_volt, true);
-
-	dev_pm_opp_put_opp_table(opp_table);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_add);
-
-/**
- * _opp_set_availability() - helper to set the availability of an opp
- * @dev:		device for which we do this operation
- * @freq:		OPP frequency to modify availability
- * @availability_req:	availability status requested for this opp
- *
- * Set the availability of an OPP, opp_{enable,disable} share a common logic
- * which is isolated here.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- */
-static int _opp_set_availability(struct device *dev, unsigned long freq,
-				 bool availability_req)
-{
-	struct opp_table *opp_table;
-	struct dev_pm_opp *tmp_opp, *opp = ERR_PTR(-ENODEV);
-	int r = 0;
-
-	/* Find the opp_table */
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		r = PTR_ERR(opp_table);
-		dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
-		return r;
-	}
-
-	mutex_lock(&opp_table->lock);
-
-	/* Do we have the frequency? */
-	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
-		if (tmp_opp->rate == freq) {
-			opp = tmp_opp;
-			break;
-		}
-	}
-
-	if (IS_ERR(opp)) {
-		r = PTR_ERR(opp);
-		goto unlock;
-	}
-
-	/* Is update really needed? */
-	if (opp->available == availability_req)
-		goto unlock;
-
-	opp->available = availability_req;
-
-	dev_pm_opp_get(opp);
-	mutex_unlock(&opp_table->lock);
-
-	/* Notify the change of the OPP availability */
-	if (availability_req)
-		blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE,
-					     opp);
-	else
-		blocking_notifier_call_chain(&opp_table->head,
-					     OPP_EVENT_DISABLE, opp);
-
-	dev_pm_opp_put(opp);
-	goto put_table;
-
-unlock:
-	mutex_unlock(&opp_table->lock);
-put_table:
-	dev_pm_opp_put_opp_table(opp_table);
-	return r;
-}
-
-/**
- * dev_pm_opp_enable() - Enable a specific OPP
- * @dev:	device for which we do this operation
- * @freq:	OPP frequency to enable
- *
- * Enables a provided opp. If the operation is valid, this returns 0, else the
- * corresponding error value. It is meant to be used for users an OPP available
- * after being temporarily made unavailable with dev_pm_opp_disable.
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- */
-int dev_pm_opp_enable(struct device *dev, unsigned long freq)
-{
-	return _opp_set_availability(dev, freq, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
-
-/**
- * dev_pm_opp_disable() - Disable a specific OPP
- * @dev:	device for which we do this operation
- * @freq:	OPP frequency to disable
- *
- * Disables a provided opp. If the operation is valid, this returns
- * 0, else the corresponding error value. It is meant to be a temporary
- * control by users to make this OPP not available until the circumstances are
- * right to make it available again (with a call to dev_pm_opp_enable).
- *
- * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modification was done OR modification was
- * successful.
- */
-int dev_pm_opp_disable(struct device *dev, unsigned long freq)
-{
-	return _opp_set_availability(dev, freq, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
-
-/**
- * dev_pm_opp_register_notifier() - Register OPP notifier for the device
- * @dev:	Device for which notifier needs to be registered
- * @nb:		Notifier block to be registered
- *
- * Return: 0 on success or a negative error value.
- */
-int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	ret = blocking_notifier_chain_register(&opp_table->head, nb);
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-EXPORT_SYMBOL(dev_pm_opp_register_notifier);
-
-/**
- * dev_pm_opp_unregister_notifier() - Unregister OPP notifier for the device
- * @dev:	Device for which notifier needs to be unregistered
- * @nb:		Notifier block to be unregistered
- *
- * Return: 0 on success or a negative error value.
- */
-int dev_pm_opp_unregister_notifier(struct device *dev,
-				   struct notifier_block *nb)
-{
-	struct opp_table *opp_table;
-	int ret;
-
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	ret = blocking_notifier_chain_unregister(&opp_table->head, nb);
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-EXPORT_SYMBOL(dev_pm_opp_unregister_notifier);
-
-/*
- * Free OPPs either created using static entries present in DT or even the
- * dynamically added entries based on remove_all param.
- */
-void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev,
-			      bool remove_all)
-{
-	struct dev_pm_opp *opp, *tmp;
-
-	/* Find if opp_table manages a single device */
-	if (list_is_singular(&opp_table->dev_list)) {
-		/* Free static OPPs */
-		list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
-			if (remove_all || !opp->dynamic)
-				dev_pm_opp_put(opp);
-		}
-	} else {
-		_remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table);
-	}
-}
-
-void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all)
-{
-	struct opp_table *opp_table;
-
-	/* Check for existing table for 'dev' */
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		int error = PTR_ERR(opp_table);
-
-		if (error != -ENODEV)
-			WARN(1, "%s: opp_table: %d\n",
-			     IS_ERR_OR_NULL(dev) ?
-					"Invalid device" : dev_name(dev),
-			     error);
-		return;
-	}
-
-	_dev_pm_opp_remove_table(opp_table, dev, remove_all);
-
-	dev_pm_opp_put_opp_table(opp_table);
-}
-
-/**
- * dev_pm_opp_remove_table() - Free all OPPs associated with the device
- * @dev:	device pointer used to lookup OPP table.
- *
- * Free both OPPs created using static entries present in DT and the
- * dynamically added entries.
- */
-void dev_pm_opp_remove_table(struct device *dev)
-{
-	_dev_pm_opp_find_and_remove_table(dev, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_remove_table);
diff --git a/drivers/base/power/opp/cpu.c b/drivers/base/power/opp/cpu.c
deleted file mode 100644
index 2d87bc1adf38..000000000000
--- a/drivers/base/power/opp/cpu.c
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Generic OPP helper interface for CPU device
- *
- * Copyright (C) 2009-2014 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/cpu.h>
-#include <linux/cpufreq.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-
-#include "opp.h"
-
-#ifdef CONFIG_CPU_FREQ
-
-/**
- * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device
- * @dev:	device for which we do this operation
- * @table:	Cpufreq table returned back to caller
- *
- * Generate a cpufreq table for a provided device- this assumes that the
- * opp table is already initialized and ready for usage.
- *
- * This function allocates required memory for the cpufreq table. It is
- * expected that the caller does the required maintenance such as freeing
- * the table as required.
- *
- * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM
- * if no memory available for the operation (table is not populated), returns 0
- * if successful and table is populated.
- *
- * WARNING: It is  important for the callers to ensure refreshing their copy of
- * the table if any of the mentioned functions have been invoked in the interim.
- */
-int dev_pm_opp_init_cpufreq_table(struct device *dev,
-				  struct cpufreq_frequency_table **table)
-{
-	struct dev_pm_opp *opp;
-	struct cpufreq_frequency_table *freq_table = NULL;
-	int i, max_opps, ret = 0;
-	unsigned long rate;
-
-	max_opps = dev_pm_opp_get_opp_count(dev);
-	if (max_opps <= 0)
-		return max_opps ? max_opps : -ENODATA;
-
-	freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
-	if (!freq_table)
-		return -ENOMEM;
-
-	for (i = 0, rate = 0; i < max_opps; i++, rate++) {
-		/* find next rate */
-		opp = dev_pm_opp_find_freq_ceil(dev, &rate);
-		if (IS_ERR(opp)) {
-			ret = PTR_ERR(opp);
-			goto out;
-		}
-		freq_table[i].driver_data = i;
-		freq_table[i].frequency = rate / 1000;
-
-		/* Is Boost/turbo opp ? */
-		if (dev_pm_opp_is_turbo(opp))
-			freq_table[i].flags = CPUFREQ_BOOST_FREQ;
-
-		dev_pm_opp_put(opp);
-	}
-
-	freq_table[i].driver_data = i;
-	freq_table[i].frequency = CPUFREQ_TABLE_END;
-
-	*table = &freq_table[0];
-
-out:
-	if (ret)
-		kfree(freq_table);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table);
-
-/**
- * dev_pm_opp_free_cpufreq_table() - free the cpufreq table
- * @dev:	device for which we do this operation
- * @table:	table to free
- *
- * Free up the table allocated by dev_pm_opp_init_cpufreq_table
- */
-void dev_pm_opp_free_cpufreq_table(struct device *dev,
-				   struct cpufreq_frequency_table **table)
-{
-	if (!table)
-		return;
-
-	kfree(*table);
-	*table = NULL;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
-#endif	/* CONFIG_CPU_FREQ */
-
-void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of)
-{
-	struct device *cpu_dev;
-	int cpu;
-
-	WARN_ON(cpumask_empty(cpumask));
-
-	for_each_cpu(cpu, cpumask) {
-		cpu_dev = get_cpu_device(cpu);
-		if (!cpu_dev) {
-			pr_err("%s: failed to get cpu%d device\n", __func__,
-			       cpu);
-			continue;
-		}
-
-		if (of)
-			dev_pm_opp_of_remove_table(cpu_dev);
-		else
-			dev_pm_opp_remove_table(cpu_dev);
-	}
-}
-
-/**
- * dev_pm_opp_cpumask_remove_table() - Removes OPP table for @cpumask
- * @cpumask:	cpumask for which OPP table needs to be removed
- *
- * This removes the OPP tables for CPUs present in the @cpumask.
- * This should be used to remove all the OPPs entries associated with
- * the cpus in @cpumask.
- */
-void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask)
-{
-	_dev_pm_opp_cpumask_remove_table(cpumask, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_cpumask_remove_table);
-
-/**
- * dev_pm_opp_set_sharing_cpus() - Mark OPP table as shared by few CPUs
- * @cpu_dev:	CPU device for which we do this operation
- * @cpumask:	cpumask of the CPUs which share the OPP table with @cpu_dev
- *
- * This marks OPP table of the @cpu_dev as shared by the CPUs present in
- * @cpumask.
- *
- * Returns -ENODEV if OPP table isn't already present.
- */
-int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev,
-				const struct cpumask *cpumask)
-{
-	struct opp_device *opp_dev;
-	struct opp_table *opp_table;
-	struct device *dev;
-	int cpu, ret = 0;
-
-	opp_table = _find_opp_table(cpu_dev);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	for_each_cpu(cpu, cpumask) {
-		if (cpu == cpu_dev->id)
-			continue;
-
-		dev = get_cpu_device(cpu);
-		if (!dev) {
-			dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
-				__func__, cpu);
-			continue;
-		}
-
-		opp_dev = _add_opp_dev(dev, opp_table);
-		if (!opp_dev) {
-			dev_err(dev, "%s: failed to add opp-dev for cpu%d device\n",
-				__func__, cpu);
-			continue;
-		}
-
-		/* Mark opp-table as multiple CPUs are sharing it now */
-		opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED;
-	}
-
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_sharing_cpus);
-
-/**
- * dev_pm_opp_get_sharing_cpus() - Get cpumask of CPUs sharing OPPs with @cpu_dev
- * @cpu_dev:	CPU device for which we do this operation
- * @cpumask:	cpumask to update with information of sharing CPUs
- *
- * This updates the @cpumask with CPUs that are sharing OPPs with @cpu_dev.
- *
- * Returns -ENODEV if OPP table isn't already present and -EINVAL if the OPP
- * table's status is access-unknown.
- */
-int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
-{
-	struct opp_device *opp_dev;
-	struct opp_table *opp_table;
-	int ret = 0;
-
-	opp_table = _find_opp_table(cpu_dev);
-	if (IS_ERR(opp_table))
-		return PTR_ERR(opp_table);
-
-	if (opp_table->shared_opp == OPP_TABLE_ACCESS_UNKNOWN) {
-		ret = -EINVAL;
-		goto put_opp_table;
-	}
-
-	cpumask_clear(cpumask);
-
-	if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) {
-		list_for_each_entry(opp_dev, &opp_table->dev_list, node)
-			cpumask_set_cpu(opp_dev->dev->id, cpumask);
-	} else {
-		cpumask_set_cpu(cpu_dev->id, cpumask);
-	}
-
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_get_sharing_cpus);
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
deleted file mode 100644
index 81cf120fcf43..000000000000
--- a/drivers/base/power/opp/debugfs.c
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Generic OPP debugfs interface
- *
- * Copyright (C) 2015-2016 Viresh Kumar <viresh.kumar@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/init.h>
-#include <linux/limits.h>
-#include <linux/slab.h>
-
-#include "opp.h"
-
-static struct dentry *rootdir;
-
-static void opp_set_dev_name(const struct device *dev, char *name)
-{
-	if (dev->parent)
-		snprintf(name, NAME_MAX, "%s-%s", dev_name(dev->parent),
-			 dev_name(dev));
-	else
-		snprintf(name, NAME_MAX, "%s", dev_name(dev));
-}
-
-void opp_debug_remove_one(struct dev_pm_opp *opp)
-{
-	debugfs_remove_recursive(opp->dentry);
-}
-
-static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
-				      struct opp_table *opp_table,
-				      struct dentry *pdentry)
-{
-	struct dentry *d;
-	int i;
-	char *name;
-
-	for (i = 0; i < opp_table->regulator_count; i++) {
-		name = kasprintf(GFP_KERNEL, "supply-%d", i);
-
-		/* Create per-opp directory */
-		d = debugfs_create_dir(name, pdentry);
-
-		kfree(name);
-
-		if (!d)
-			return false;
-
-		if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d,
-					  &opp->supplies[i].u_volt))
-			return false;
-
-		if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d,
-					  &opp->supplies[i].u_volt_min))
-			return false;
-
-		if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d,
-					  &opp->supplies[i].u_volt_max))
-			return false;
-
-		if (!debugfs_create_ulong("u_amp", S_IRUGO, d,
-					  &opp->supplies[i].u_amp))
-			return false;
-	}
-
-	return true;
-}
-
-int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
-{
-	struct dentry *pdentry = opp_table->dentry;
-	struct dentry *d;
-	char name[25];	/* 20 chars for 64 bit value + 5 (opp:\0) */
-
-	/* Rate is unique to each OPP, use it to give opp-name */
-	snprintf(name, sizeof(name), "opp:%lu", opp->rate);
-
-	/* Create per-opp directory */
-	d = debugfs_create_dir(name, pdentry);
-	if (!d)
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
-		return -ENOMEM;
-
-	if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
-		return -ENOMEM;
-
-	if (!opp_debug_create_supplies(opp, opp_table, d))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
-				  &opp->clock_latency_ns))
-		return -ENOMEM;
-
-	opp->dentry = d;
-	return 0;
-}
-
-static int opp_list_debug_create_dir(struct opp_device *opp_dev,
-				     struct opp_table *opp_table)
-{
-	const struct device *dev = opp_dev->dev;
-	struct dentry *d;
-
-	opp_set_dev_name(dev, opp_table->dentry_name);
-
-	/* Create device specific directory */
-	d = debugfs_create_dir(opp_table->dentry_name, rootdir);
-	if (!d) {
-		dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
-		return -ENOMEM;
-	}
-
-	opp_dev->dentry = d;
-	opp_table->dentry = d;
-
-	return 0;
-}
-
-static int opp_list_debug_create_link(struct opp_device *opp_dev,
-				      struct opp_table *opp_table)
-{
-	const struct device *dev = opp_dev->dev;
-	char name[NAME_MAX];
-	struct dentry *d;
-
-	opp_set_dev_name(opp_dev->dev, name);
-
-	/* Create device specific directory link */
-	d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name);
-	if (!d) {
-		dev_err(dev, "%s: Failed to create link\n", __func__);
-		return -ENOMEM;
-	}
-
-	opp_dev->dentry = d;
-
-	return 0;
-}
-
-/**
- * opp_debug_register - add a device opp node to the debugfs 'opp' directory
- * @opp_dev: opp-dev pointer for device
- * @opp_table: the device-opp being added
- *
- * Dynamically adds device specific directory in debugfs 'opp' directory. If the
- * device-opp is shared with other devices, then links will be created for all
- * devices except the first.
- *
- * Return: 0 on success, otherwise negative error.
- */
-int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
-{
-	if (!rootdir) {
-		pr_debug("%s: Uninitialized rootdir\n", __func__);
-		return -EINVAL;
-	}
-
-	if (opp_table->dentry)
-		return opp_list_debug_create_link(opp_dev, opp_table);
-
-	return opp_list_debug_create_dir(opp_dev, opp_table);
-}
-
-static void opp_migrate_dentry(struct opp_device *opp_dev,
-			       struct opp_table *opp_table)
-{
-	struct opp_device *new_dev;
-	const struct device *dev;
-	struct dentry *dentry;
-
-	/* Look for next opp-dev */
-	list_for_each_entry(new_dev, &opp_table->dev_list, node)
-		if (new_dev != opp_dev)
-			break;
-
-	/* new_dev is guaranteed to be valid here */
-	dev = new_dev->dev;
-	debugfs_remove_recursive(new_dev->dentry);
-
-	opp_set_dev_name(dev, opp_table->dentry_name);
-
-	dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir,
-				opp_table->dentry_name);
-	if (!dentry) {
-		dev_err(dev, "%s: Failed to rename link from: %s to %s\n",
-			__func__, dev_name(opp_dev->dev), dev_name(dev));
-		return;
-	}
-
-	new_dev->dentry = dentry;
-	opp_table->dentry = dentry;
-}
-
-/**
- * opp_debug_unregister - remove a device opp node from debugfs opp directory
- * @opp_dev: opp-dev pointer for device
- * @opp_table: the device-opp being removed
- *
- * Dynamically removes device specific directory from debugfs 'opp' directory.
- */
-void opp_debug_unregister(struct opp_device *opp_dev,
-			  struct opp_table *opp_table)
-{
-	if (opp_dev->dentry == opp_table->dentry) {
-		/* Move the real dentry object under another device */
-		if (!list_is_singular(&opp_table->dev_list)) {
-			opp_migrate_dentry(opp_dev, opp_table);
-			goto out;
-		}
-		opp_table->dentry = NULL;
-	}
-
-	debugfs_remove_recursive(opp_dev->dentry);
-
-out:
-	opp_dev->dentry = NULL;
-}
-
-static int __init opp_debug_init(void)
-{
-	/* Create /sys/kernel/debug/opp directory */
-	rootdir = debugfs_create_dir("opp", NULL);
-	if (!rootdir) {
-		pr_err("%s: Failed to create root directory\n", __func__);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-core_initcall(opp_debug_init);
diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
deleted file mode 100644
index 0b718886479b..000000000000
--- a/drivers/base/power/opp/of.c
+++ /dev/null
@@ -1,633 +0,0 @@
-/*
- * Generic OPP OF helpers
- *
- * Copyright (C) 2009-2010 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/cpu.h>
-#include <linux/errno.h>
-#include <linux/device.h>
-#include <linux/of.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-
-#include "opp.h"
-
-static struct opp_table *_managed_opp(const struct device_node *np)
-{
-	struct opp_table *opp_table, *managed_table = NULL;
-
-	mutex_lock(&opp_table_lock);
-
-	list_for_each_entry(opp_table, &opp_tables, node) {
-		if (opp_table->np == np) {
-			/*
-			 * Multiple devices can point to the same OPP table and
-			 * so will have same node-pointer, np.
-			 *
-			 * But the OPPs will be considered as shared only if the
-			 * OPP table contains a "opp-shared" property.
-			 */
-			if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) {
-				_get_opp_table_kref(opp_table);
-				managed_table = opp_table;
-			}
-
-			break;
-		}
-	}
-
-	mutex_unlock(&opp_table_lock);
-
-	return managed_table;
-}
-
-void _of_init_opp_table(struct opp_table *opp_table, struct device *dev)
-{
-	struct device_node *np;
-
-	/*
-	 * Only required for backward compatibility with v1 bindings, but isn't
-	 * harmful for other cases. And so we do it unconditionally.
-	 */
-	np = of_node_get(dev->of_node);
-	if (np) {
-		u32 val;
-
-		if (!of_property_read_u32(np, "clock-latency", &val))
-			opp_table->clock_latency_ns_max = val;
-		of_property_read_u32(np, "voltage-tolerance",
-				     &opp_table->voltage_tolerance_v1);
-		of_node_put(np);
-	}
-}
-
-static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
-			      struct device_node *np)
-{
-	unsigned int count = opp_table->supported_hw_count;
-	u32 version;
-	int ret;
-
-	if (!opp_table->supported_hw) {
-		/*
-		 * In the case that no supported_hw has been set by the
-		 * platform but there is an opp-supported-hw value set for
-		 * an OPP then the OPP should not be enabled as there is
-		 * no way to see if the hardware supports it.
-		 */
-		if (of_find_property(np, "opp-supported-hw", NULL))
-			return false;
-		else
-			return true;
-	}
-
-	while (count--) {
-		ret = of_property_read_u32_index(np, "opp-supported-hw", count,
-						 &version);
-		if (ret) {
-			dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n",
-				 __func__, count, ret);
-			return false;
-		}
-
-		/* Both of these are bitwise masks of the versions */
-		if (!(version & opp_table->supported_hw[count]))
-			return false;
-	}
-
-	return true;
-}
-
-static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
-			      struct opp_table *opp_table)
-{
-	u32 *microvolt, *microamp = NULL;
-	int supplies, vcount, icount, ret, i, j;
-	struct property *prop = NULL;
-	char name[NAME_MAX];
-
-	supplies = opp_table->regulator_count ? opp_table->regulator_count : 1;
-
-	/* Search for "opp-microvolt-<name>" */
-	if (opp_table->prop_name) {
-		snprintf(name, sizeof(name), "opp-microvolt-%s",
-			 opp_table->prop_name);
-		prop = of_find_property(opp->np, name, NULL);
-	}
-
-	if (!prop) {
-		/* Search for "opp-microvolt" */
-		sprintf(name, "opp-microvolt");
-		prop = of_find_property(opp->np, name, NULL);
-
-		/* Missing property isn't a problem, but an invalid entry is */
-		if (!prop) {
-			if (!opp_table->regulator_count)
-				return 0;
-
-			dev_err(dev, "%s: opp-microvolt missing although OPP managing regulators\n",
-				__func__);
-			return -EINVAL;
-		}
-	}
-
-	vcount = of_property_count_u32_elems(opp->np, name);
-	if (vcount < 0) {
-		dev_err(dev, "%s: Invalid %s property (%d)\n",
-			__func__, name, vcount);
-		return vcount;
-	}
-
-	/* There can be one or three elements per supply */
-	if (vcount != supplies && vcount != supplies * 3) {
-		dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
-			__func__, name, vcount, supplies);
-		return -EINVAL;
-	}
-
-	microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL);
-	if (!microvolt)
-		return -ENOMEM;
-
-	ret = of_property_read_u32_array(opp->np, name, microvolt, vcount);
-	if (ret) {
-		dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
-		ret = -EINVAL;
-		goto free_microvolt;
-	}
-
-	/* Search for "opp-microamp-<name>" */
-	prop = NULL;
-	if (opp_table->prop_name) {
-		snprintf(name, sizeof(name), "opp-microamp-%s",
-			 opp_table->prop_name);
-		prop = of_find_property(opp->np, name, NULL);
-	}
-
-	if (!prop) {
-		/* Search for "opp-microamp" */
-		sprintf(name, "opp-microamp");
-		prop = of_find_property(opp->np, name, NULL);
-	}
-
-	if (prop) {
-		icount = of_property_count_u32_elems(opp->np, name);
-		if (icount < 0) {
-			dev_err(dev, "%s: Invalid %s property (%d)\n", __func__,
-				name, icount);
-			ret = icount;
-			goto free_microvolt;
-		}
-
-		if (icount != supplies) {
-			dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
-				__func__, name, icount, supplies);
-			ret = -EINVAL;
-			goto free_microvolt;
-		}
-
-		microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL);
-		if (!microamp) {
-			ret = -EINVAL;
-			goto free_microvolt;
-		}
-
-		ret = of_property_read_u32_array(opp->np, name, microamp,
-						 icount);
-		if (ret) {
-			dev_err(dev, "%s: error parsing %s: %d\n", __func__,
-				name, ret);
-			ret = -EINVAL;
-			goto free_microamp;
-		}
-	}
-
-	for (i = 0, j = 0; i < supplies; i++) {
-		opp->supplies[i].u_volt = microvolt[j++];
-
-		if (vcount == supplies) {
-			opp->supplies[i].u_volt_min = opp->supplies[i].u_volt;
-			opp->supplies[i].u_volt_max = opp->supplies[i].u_volt;
-		} else {
-			opp->supplies[i].u_volt_min = microvolt[j++];
-			opp->supplies[i].u_volt_max = microvolt[j++];
-		}
-
-		if (microamp)
-			opp->supplies[i].u_amp = microamp[i];
-	}
-
-free_microamp:
-	kfree(microamp);
-free_microvolt:
-	kfree(microvolt);
-
-	return ret;
-}
-
-/**
- * dev_pm_opp_of_remove_table() - Free OPP table entries created from static DT
- *				  entries
- * @dev:	device pointer used to lookup OPP table.
- *
- * Free OPPs created using static entries present in DT.
- */
-void dev_pm_opp_of_remove_table(struct device *dev)
-{
-	_dev_pm_opp_find_and_remove_table(dev, false);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
-
-/* Returns opp descriptor node for a device node, caller must
- * do of_node_put() */
-static struct device_node *_opp_of_get_opp_desc_node(struct device_node *np)
-{
-	/*
-	 * There should be only ONE phandle present in "operating-points-v2"
-	 * property.
-	 */
-
-	return of_parse_phandle(np, "operating-points-v2", 0);
-}
-
-/* Returns opp descriptor node for a device, caller must do of_node_put() */
-struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev)
-{
-	return _opp_of_get_opp_desc_node(dev->of_node);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node);
-
-/**
- * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
- * @opp_table:	OPP table
- * @dev:	device for which we do this operation
- * @np:		device node
- *
- * This function adds an opp definition to the opp table and returns status. The
- * opp can be controlled using dev_pm_opp_enable/disable functions and may be
- * removed by dev_pm_opp_remove.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- * -EINVAL	Failed parsing the OPP node
- */
-static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev,
-			      struct device_node *np)
-{
-	struct dev_pm_opp *new_opp;
-	u64 rate;
-	u32 val;
-	int ret;
-
-	new_opp = _opp_allocate(opp_table);
-	if (!new_opp)
-		return -ENOMEM;
-
-	ret = of_property_read_u64(np, "opp-hz", &rate);
-	if (ret < 0) {
-		dev_err(dev, "%s: opp-hz not found\n", __func__);
-		goto free_opp;
-	}
-
-	/* Check if the OPP supports hardware's hierarchy of versions or not */
-	if (!_opp_is_supported(dev, opp_table, np)) {
-		dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate);
-		goto free_opp;
-	}
-
-	/*
-	 * Rate is defined as an unsigned long in clk API, and so casting
-	 * explicitly to its type. Must be fixed once rate is 64 bit
-	 * guaranteed in clk API.
-	 */
-	new_opp->rate = (unsigned long)rate;
-	new_opp->turbo = of_property_read_bool(np, "turbo-mode");
-
-	new_opp->np = np;
-	new_opp->dynamic = false;
-	new_opp->available = true;
-
-	if (!of_property_read_u32(np, "clock-latency-ns", &val))
-		new_opp->clock_latency_ns = val;
-
-	ret = opp_parse_supplies(new_opp, dev, opp_table);
-	if (ret)
-		goto free_opp;
-
-	ret = _opp_add(dev, new_opp, opp_table);
-	if (ret) {
-		/* Don't return error for duplicate OPPs */
-		if (ret == -EBUSY)
-			ret = 0;
-		goto free_opp;
-	}
-
-	/* OPP to select on device suspend */
-	if (of_property_read_bool(np, "opp-suspend")) {
-		if (opp_table->suspend_opp) {
-			dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n",
-				 __func__, opp_table->suspend_opp->rate,
-				 new_opp->rate);
-		} else {
-			new_opp->suspend = true;
-			opp_table->suspend_opp = new_opp;
-		}
-	}
-
-	if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max)
-		opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
-
-	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
-		 __func__, new_opp->turbo, new_opp->rate,
-		 new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
-		 new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns);
-
-	/*
-	 * Notify the changes in the availability of the operable
-	 * frequency/voltage list.
-	 */
-	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp);
-	return 0;
-
-free_opp:
-	_opp_free(new_opp);
-
-	return ret;
-}
-
-/* Initializes OPP tables based on new bindings */
-static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np)
-{
-	struct device_node *np;
-	struct opp_table *opp_table;
-	int ret = 0, count = 0;
-
-	opp_table = _managed_opp(opp_np);
-	if (opp_table) {
-		/* OPPs are already managed */
-		if (!_add_opp_dev(dev, opp_table))
-			ret = -ENOMEM;
-		goto put_opp_table;
-	}
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return -ENOMEM;
-
-	/* We have opp-table node now, iterate over it and add OPPs */
-	for_each_available_child_of_node(opp_np, np) {
-		count++;
-
-		ret = _opp_add_static_v2(opp_table, dev, np);
-		if (ret) {
-			dev_err(dev, "%s: Failed to add OPP, %d\n", __func__,
-				ret);
-			_dev_pm_opp_remove_table(opp_table, dev, false);
-			goto put_opp_table;
-		}
-	}
-
-	/* There should be one of more OPP defined */
-	if (WARN_ON(!count)) {
-		ret = -ENOENT;
-		goto put_opp_table;
-	}
-
-	opp_table->np = opp_np;
-	if (of_property_read_bool(opp_np, "opp-shared"))
-		opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED;
-	else
-		opp_table->shared_opp = OPP_TABLE_ACCESS_EXCLUSIVE;
-
-put_opp_table:
-	dev_pm_opp_put_opp_table(opp_table);
-
-	return ret;
-}
-
-/* Initializes OPP tables based on old-deprecated bindings */
-static int _of_add_opp_table_v1(struct device *dev)
-{
-	struct opp_table *opp_table;
-	const struct property *prop;
-	const __be32 *val;
-	int nr, ret = 0;
-
-	prop = of_find_property(dev->of_node, "operating-points", NULL);
-	if (!prop)
-		return -ENODEV;
-	if (!prop->value)
-		return -ENODATA;
-
-	/*
-	 * Each OPP is a set of tuples consisting of frequency and
-	 * voltage like <freq-kHz vol-uV>.
-	 */
-	nr = prop->length / sizeof(u32);
-	if (nr % 2) {
-		dev_err(dev, "%s: Invalid OPP table\n", __func__);
-		return -EINVAL;
-	}
-
-	opp_table = dev_pm_opp_get_opp_table(dev);
-	if (!opp_table)
-		return -ENOMEM;
-
-	val = prop->value;
-	while (nr) {
-		unsigned long freq = be32_to_cpup(val++) * 1000;
-		unsigned long volt = be32_to_cpup(val++);
-
-		ret = _opp_add_v1(opp_table, dev, freq, volt, false);
-		if (ret) {
-			dev_err(dev, "%s: Failed to add OPP %ld (%d)\n",
-				__func__, freq, ret);
-			_dev_pm_opp_remove_table(opp_table, dev, false);
-			break;
-		}
-		nr -= 2;
-	}
-
-	dev_pm_opp_put_opp_table(opp_table);
-	return ret;
-}
-
-/**
- * dev_pm_opp_of_add_table() - Initialize opp table from device tree
- * @dev:	device pointer used to lookup OPP table.
- *
- * Register the initial OPP table with the OPP library for given device.
- *
- * Return:
- * 0		On success OR
- *		Duplicate OPPs (both freq and volt are same) and opp->available
- * -EEXIST	Freq are same and volt are different OR
- *		Duplicate OPPs (both freq and volt are same) and !opp->available
- * -ENOMEM	Memory allocation failure
- * -ENODEV	when 'operating-points' property is not found or is invalid data
- *		in device node.
- * -ENODATA	when empty 'operating-points' property is found
- * -EINVAL	when invalid entries are found in opp-v2 table
- */
-int dev_pm_opp_of_add_table(struct device *dev)
-{
-	struct device_node *opp_np;
-	int ret;
-
-	/*
-	 * OPPs have two version of bindings now. The older one is deprecated,
-	 * try for the new binding first.
-	 */
-	opp_np = dev_pm_opp_of_get_opp_desc_node(dev);
-	if (!opp_np) {
-		/*
-		 * Try old-deprecated bindings for backward compatibility with
-		 * older dtbs.
-		 */
-		return _of_add_opp_table_v1(dev);
-	}
-
-	ret = _of_add_opp_table_v2(dev, opp_np);
-	of_node_put(opp_np);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
-
-/* CPU device specific helpers */
-
-/**
- * dev_pm_opp_of_cpumask_remove_table() - Removes OPP table for @cpumask
- * @cpumask:	cpumask for which OPP table needs to be removed
- *
- * This removes the OPP tables for CPUs present in the @cpumask.
- * This should be used only to remove static entries created from DT.
- */
-void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask)
-{
-	_dev_pm_opp_cpumask_remove_table(cpumask, true);
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table);
-
-/**
- * dev_pm_opp_of_cpumask_add_table() - Adds OPP table for @cpumask
- * @cpumask:	cpumask for which OPP table needs to be added.
- *
- * This adds the OPP tables for CPUs present in the @cpumask.
- */
-int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask)
-{
-	struct device *cpu_dev;
-	int cpu, ret = 0;
-
-	WARN_ON(cpumask_empty(cpumask));
-
-	for_each_cpu(cpu, cpumask) {
-		cpu_dev = get_cpu_device(cpu);
-		if (!cpu_dev) {
-			pr_err("%s: failed to get cpu%d device\n", __func__,
-			       cpu);
-			continue;
-		}
-
-		ret = dev_pm_opp_of_add_table(cpu_dev);
-		if (ret) {
-			/*
-			 * OPP may get registered dynamically, don't print error
-			 * message here.
-			 */
-			pr_debug("%s: couldn't find opp table for cpu:%d, %d\n",
-				 __func__, cpu, ret);
-
-			/* Free all other OPPs */
-			dev_pm_opp_of_cpumask_remove_table(cpumask);
-			break;
-		}
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table);
-
-/*
- * Works only for OPP v2 bindings.
- *
- * Returns -ENOENT if operating-points-v2 bindings aren't supported.
- */
-/**
- * dev_pm_opp_of_get_sharing_cpus() - Get cpumask of CPUs sharing OPPs with
- *				      @cpu_dev using operating-points-v2
- *				      bindings.
- *
- * @cpu_dev:	CPU device for which we do this operation
- * @cpumask:	cpumask to update with information of sharing CPUs
- *
- * This updates the @cpumask with CPUs that are sharing OPPs with @cpu_dev.
- *
- * Returns -ENOENT if operating-points-v2 isn't present for @cpu_dev.
- */
-int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
-				   struct cpumask *cpumask)
-{
-	struct device_node *np, *tmp_np, *cpu_np;
-	int cpu, ret = 0;
-
-	/* Get OPP descriptor node */
-	np = dev_pm_opp_of_get_opp_desc_node(cpu_dev);
-	if (!np) {
-		dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__);
-		return -ENOENT;
-	}
-
-	cpumask_set_cpu(cpu_dev->id, cpumask);
-
-	/* OPPs are shared ? */
-	if (!of_property_read_bool(np, "opp-shared"))
-		goto put_cpu_node;
-
-	for_each_possible_cpu(cpu) {
-		if (cpu == cpu_dev->id)
-			continue;
-
-		cpu_np = of_get_cpu_node(cpu, NULL);
-		if (!cpu_np) {
-			dev_err(cpu_dev, "%s: failed to get cpu%d node\n",
-				__func__, cpu);
-			ret = -ENOENT;
-			goto put_cpu_node;
-		}
-
-		/* Get OPP descriptor node */
-		tmp_np = _opp_of_get_opp_desc_node(cpu_np);
-		if (!tmp_np) {
-			pr_err("%pOF: Couldn't find opp node\n", cpu_np);
-			ret = -ENOENT;
-			goto put_cpu_node;
-		}
-
-		/* CPUs are sharing opp node */
-		if (np == tmp_np)
-			cpumask_set_cpu(cpu, cpumask);
-
-		of_node_put(tmp_np);
-	}
-
-put_cpu_node:
-	of_node_put(np);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_sharing_cpus);
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
deleted file mode 100644
index 166eef990599..000000000000
--- a/drivers/base/power/opp/opp.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Generic OPP Interface
- *
- * Copyright (C) 2009-2010 Texas Instruments Incorporated.
- *	Nishanth Menon
- *	Romit Dasgupta
- *	Kevin Hilman
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#ifndef __DRIVER_OPP_H__
-#define __DRIVER_OPP_H__
-
-#include <linux/device.h>
-#include <linux/kernel.h>
-#include <linux/kref.h>
-#include <linux/list.h>
-#include <linux/limits.h>
-#include <linux/pm_opp.h>
-#include <linux/notifier.h>
-
-struct clk;
-struct regulator;
-
-/* Lock to allow exclusive modification to the device and opp lists */
-extern struct mutex opp_table_lock;
-
-extern struct list_head opp_tables;
-
-/*
- * Internal data structure organization with the OPP layer library is as
- * follows:
- * opp_tables (root)
- *	|- device 1 (represents voltage domain 1)
- *	|	|- opp 1 (availability, freq, voltage)
- *	|	|- opp 2 ..
- *	...	...
- *	|	`- opp n ..
- *	|- device 2 (represents the next voltage domain)
- *	...
- *	`- device m (represents mth voltage domain)
- * device 1, 2.. are represented by opp_table structure while each opp
- * is represented by the opp structure.
- */
-
-/**
- * struct dev_pm_opp - Generic OPP description structure
- * @node:	opp table node. The nodes are maintained throughout the lifetime
- *		of boot. It is expected only an optimal set of OPPs are
- *		added to the library by the SoC framework.
- *		IMPORTANT: the opp nodes should be maintained in increasing
- *		order.
- * @kref:	for reference count of the OPP.
- * @available:	true/false - marks if this OPP as available or not
- * @dynamic:	not-created from static DT entries.
- * @turbo:	true if turbo (boost) OPP
- * @suspend:	true if suspend OPP
- * @rate:	Frequency in hertz
- * @supplies:	Power supplies voltage/current values
- * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
- *		frequency from any other OPP's frequency.
- * @opp_table:	points back to the opp_table struct this opp belongs to
- * @np:		OPP's device node.
- * @dentry:	debugfs dentry pointer (per opp)
- *
- * This structure stores the OPP information for a given device.
- */
-struct dev_pm_opp {
-	struct list_head node;
-	struct kref kref;
-
-	bool available;
-	bool dynamic;
-	bool turbo;
-	bool suspend;
-	unsigned long rate;
-
-	struct dev_pm_opp_supply *supplies;
-
-	unsigned long clock_latency_ns;
-
-	struct opp_table *opp_table;
-
-	struct device_node *np;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dentry;
-#endif
-};
-
-/**
- * struct opp_device - devices managed by 'struct opp_table'
- * @node:	list node
- * @dev:	device to which the struct object belongs
- * @dentry:	debugfs dentry pointer (per device)
- *
- * This is an internal data structure maintaining the devices that are managed
- * by 'struct opp_table'.
- */
-struct opp_device {
-	struct list_head node;
-	const struct device *dev;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dentry;
-#endif
-};
-
-enum opp_table_access {
-	OPP_TABLE_ACCESS_UNKNOWN = 0,
-	OPP_TABLE_ACCESS_EXCLUSIVE = 1,
-	OPP_TABLE_ACCESS_SHARED = 2,
-};
-
-/**
- * struct opp_table - Device opp structure
- * @node:	table node - contains the devices with OPPs that
- *		have been registered. Nodes once added are not modified in this
- *		table.
- * @head:	notifier head to notify the OPP availability changes.
- * @dev_list:	list of devices that share these OPPs
- * @opp_list:	table of opps
- * @kref:	for reference count of the table.
- * @lock:	mutex protecting the opp_list.
- * @np:		struct device_node pointer for opp's DT node.
- * @clock_latency_ns_max: Max clock latency in nanoseconds.
- * @shared_opp: OPP is shared between multiple devices.
- * @suspend_opp: Pointer to OPP to be used during device suspend.
- * @supported_hw: Array of version number to support.
- * @supported_hw_count: Number of elements in supported_hw array.
- * @prop_name: A name to postfix to many DT properties, while parsing them.
- * @clk: Device's clock handle
- * @regulators: Supply regulators
- * @regulator_count: Number of power supply regulators
- * @set_opp: Platform specific set_opp callback
- * @set_opp_data: Data to be passed to set_opp callback
- * @dentry:	debugfs dentry pointer of the real device directory (not links).
- * @dentry_name: Name of the real dentry.
- *
- * @voltage_tolerance_v1: In percentage, for v1 bindings only.
- *
- * This is an internal data structure maintaining the link to opps attached to
- * a device. This structure is not meant to be shared to users as it is
- * meant for book keeping and private to OPP library.
- */
-struct opp_table {
-	struct list_head node;
-
-	struct blocking_notifier_head head;
-	struct list_head dev_list;
-	struct list_head opp_list;
-	struct kref kref;
-	struct mutex lock;
-
-	struct device_node *np;
-	unsigned long clock_latency_ns_max;
-
-	/* For backward compatibility with v1 bindings */
-	unsigned int voltage_tolerance_v1;
-
-	enum opp_table_access shared_opp;
-	struct dev_pm_opp *suspend_opp;
-
-	unsigned int *supported_hw;
-	unsigned int supported_hw_count;
-	const char *prop_name;
-	struct clk *clk;
-	struct regulator **regulators;
-	unsigned int regulator_count;
-
-	int (*set_opp)(struct dev_pm_set_opp_data *data);
-	struct dev_pm_set_opp_data *set_opp_data;
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dentry;
-	char dentry_name[NAME_MAX];
-#endif
-};
-
-/* Routines internal to opp core */
-void _get_opp_table_kref(struct opp_table *opp_table);
-struct opp_table *_find_opp_table(struct device *dev);
-struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
-void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, bool remove_all);
-void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all);
-struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table);
-void _opp_free(struct dev_pm_opp *opp);
-int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
-int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic);
-void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of);
-struct opp_table *_add_opp_table(struct device *dev);
-
-#ifdef CONFIG_OF
-void _of_init_opp_table(struct opp_table *opp_table, struct device *dev);
-#else
-static inline void _of_init_opp_table(struct opp_table *opp_table, struct device *dev) {}
-#endif
-
-#ifdef CONFIG_DEBUG_FS
-void opp_debug_remove_one(struct dev_pm_opp *opp);
-int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
-int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
-void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table);
-#else
-static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
-
-static inline int opp_debug_create_one(struct dev_pm_opp *opp,
-				       struct opp_table *opp_table)
-{ return 0; }
-static inline int opp_debug_register(struct opp_device *opp_dev,
-				     struct opp_table *opp_table)
-{ return 0; }
-
-static inline void opp_debug_unregister(struct opp_device *opp_dev,
-					struct opp_table *opp_table)
-{ }
-#endif		/* DEBUG_FS */
-
-#endif		/* __DRIVER_OPP_H__ */
diff --git a/drivers/opp/Kconfig b/drivers/opp/Kconfig
new file mode 100644
index 000000000000..a7fbb93f302c
--- /dev/null
+++ b/drivers/opp/Kconfig
@@ -0,0 +1,13 @@
+config PM_OPP
+	bool
+	select SRCU
+	---help---
+	  SOCs have a standard set of tuples consisting of frequency and
+	  voltage pairs that the device will support per voltage domain. This
+	  is called Operating Performance Point or OPP. The actual definitions
+	  of OPP varies over silicon within the same family of devices.
+
+	  OPP layer organizes the data internally using device pointers
+	  representing individual voltage domains and provides SOC
+	  implementations a ready to use framework to manage OPPs.
+	  For more information, read <file:Documentation/power/opp.txt>
diff --git a/drivers/opp/Makefile b/drivers/opp/Makefile
new file mode 100644
index 000000000000..e70ceb406fe9
--- /dev/null
+++ b/drivers/opp/Makefile
@@ -0,0 +1,4 @@
+ccflags-$(CONFIG_DEBUG_DRIVER)	:= -DDEBUG
+obj-y				+= core.o cpu.o
+obj-$(CONFIG_OF)		+= of.o
+obj-$(CONFIG_DEBUG_FS)		+= debugfs.o
diff --git a/drivers/opp/core.c b/drivers/opp/core.c
new file mode 100644
index 000000000000..a6de32530693
--- /dev/null
+++ b/drivers/opp/core.c
@@ -0,0 +1,1747 @@
+/*
+ * Generic OPP Interface
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/clk.h>
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+#include <linux/export.h>
+#include <linux/regulator/consumer.h>
+
+#include "opp.h"
+
+/*
+ * The root of the list of all opp-tables. All opp_table structures branch off
+ * from here, with each opp_table containing the list of opps it supports in
+ * various states of availability.
+ */
+LIST_HEAD(opp_tables);
+/* Lock to allow exclusive modification to the device and opp lists */
+DEFINE_MUTEX(opp_table_lock);
+
+static void dev_pm_opp_get(struct dev_pm_opp *opp);
+
+static struct opp_device *_find_opp_dev(const struct device *dev,
+					struct opp_table *opp_table)
+{
+	struct opp_device *opp_dev;
+
+	list_for_each_entry(opp_dev, &opp_table->dev_list, node)
+		if (opp_dev->dev == dev)
+			return opp_dev;
+
+	return NULL;
+}
+
+static struct opp_table *_find_opp_table_unlocked(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	list_for_each_entry(opp_table, &opp_tables, node) {
+		if (_find_opp_dev(dev, opp_table)) {
+			_get_opp_table_kref(opp_table);
+
+			return opp_table;
+		}
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+/**
+ * _find_opp_table() - find opp_table struct using device pointer
+ * @dev:	device pointer used to lookup OPP table
+ *
+ * Search OPP table for one containing matching device.
+ *
+ * Return: pointer to 'struct opp_table' if found, otherwise -ENODEV or
+ * -EINVAL based on type of error.
+ *
+ * The callers must call dev_pm_opp_put_opp_table() after the table is used.
+ */
+struct opp_table *_find_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	if (IS_ERR_OR_NULL(dev)) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return ERR_PTR(-EINVAL);
+	}
+
+	mutex_lock(&opp_table_lock);
+	opp_table = _find_opp_table_unlocked(dev);
+	mutex_unlock(&opp_table_lock);
+
+	return opp_table;
+}
+
+/**
+ * dev_pm_opp_get_voltage() - Gets the voltage corresponding to an opp
+ * @opp:	opp for which voltage has to be returned for
+ *
+ * Return: voltage in micro volt corresponding to the opp, else
+ * return 0
+ *
+ * This is useful only for devices with single power supply.
+ */
+unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
+{
+	if (IS_ERR_OR_NULL(opp)) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return 0;
+	}
+
+	return opp->supplies[0].u_volt;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_voltage);
+
+/**
+ * dev_pm_opp_get_freq() - Gets the frequency corresponding to an available opp
+ * @opp:	opp for which frequency has to be returned for
+ *
+ * Return: frequency in hertz corresponding to the opp, else
+ * return 0
+ */
+unsigned long dev_pm_opp_get_freq(struct dev_pm_opp *opp)
+{
+	if (IS_ERR_OR_NULL(opp) || !opp->available) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return 0;
+	}
+
+	return opp->rate;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_freq);
+
+/**
+ * dev_pm_opp_is_turbo() - Returns if opp is turbo OPP or not
+ * @opp: opp for which turbo mode is being verified
+ *
+ * Turbo OPPs are not for normal use, and can be enabled (under certain
+ * conditions) for short duration of times to finish high throughput work
+ * quickly. Running on them for longer times may overheat the chip.
+ *
+ * Return: true if opp is turbo opp, else false.
+ */
+bool dev_pm_opp_is_turbo(struct dev_pm_opp *opp)
+{
+	if (IS_ERR_OR_NULL(opp) || !opp->available) {
+		pr_err("%s: Invalid parameters\n", __func__);
+		return false;
+	}
+
+	return opp->turbo;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_is_turbo);
+
+/**
+ * dev_pm_opp_get_max_clock_latency() - Get max clock latency in nanoseconds
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the max clock latency in nanoseconds.
+ */
+unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
+{
+	struct opp_table *opp_table;
+	unsigned long clock_latency_ns;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return 0;
+
+	clock_latency_ns = opp_table->clock_latency_ns_max;
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return clock_latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
+
+/**
+ * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max voltage latency in nanoseconds.
+ */
+unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *opp;
+	struct regulator *reg;
+	unsigned long latency_ns = 0;
+	int ret, i, count;
+	struct {
+		unsigned long min;
+		unsigned long max;
+	} *uV;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return 0;
+
+	count = opp_table->regulator_count;
+
+	/* Regulator may not be required for the device */
+	if (!count)
+		goto put_opp_table;
+
+	uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL);
+	if (!uV)
+		goto put_opp_table;
+
+	mutex_lock(&opp_table->lock);
+
+	for (i = 0; i < count; i++) {
+		uV[i].min = ~0;
+		uV[i].max = 0;
+
+		list_for_each_entry(opp, &opp_table->opp_list, node) {
+			if (!opp->available)
+				continue;
+
+			if (opp->supplies[i].u_volt_min < uV[i].min)
+				uV[i].min = opp->supplies[i].u_volt_min;
+			if (opp->supplies[i].u_volt_max > uV[i].max)
+				uV[i].max = opp->supplies[i].u_volt_max;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+
+	/*
+	 * The caller needs to ensure that opp_table (and hence the regulator)
+	 * isn't freed, while we are executing this routine.
+	 */
+	for (i = 0; i < count; i++) {
+		reg = opp_table->regulators[i];
+		ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max);
+		if (ret > 0)
+			latency_ns += ret * 1000;
+	}
+
+	kfree(uV);
+put_opp_table:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return latency_ns;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_volt_latency);
+
+/**
+ * dev_pm_opp_get_max_transition_latency() - Get max transition latency in
+ *					     nanoseconds
+ * @dev: device for which we do this operation
+ *
+ * Return: This function returns the max transition latency, in nanoseconds, to
+ * switch from one OPP to other.
+ */
+unsigned long dev_pm_opp_get_max_transition_latency(struct device *dev)
+{
+	return dev_pm_opp_get_max_volt_latency(dev) +
+		dev_pm_opp_get_max_clock_latency(dev);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_transition_latency);
+
+/**
+ * dev_pm_opp_get_suspend_opp_freq() - Get frequency of suspend opp in Hz
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the frequency of the OPP marked as suspend_opp
+ * if one is available, else returns 0;
+ */
+unsigned long dev_pm_opp_get_suspend_opp_freq(struct device *dev)
+{
+	struct opp_table *opp_table;
+	unsigned long freq = 0;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return 0;
+
+	if (opp_table->suspend_opp && opp_table->suspend_opp->available)
+		freq = dev_pm_opp_get_freq(opp_table->suspend_opp);
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return freq;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_suspend_opp_freq);
+
+/**
+ * dev_pm_opp_get_opp_count() - Get number of opps available in the opp table
+ * @dev:	device for which we do this operation
+ *
+ * Return: This function returns the number of available opps if there are any,
+ * else returns 0 if none or the corresponding error value.
+ */
+int dev_pm_opp_get_opp_count(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp;
+	int count = 0;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		count = PTR_ERR(opp_table);
+		dev_err(dev, "%s: OPP table not found (%d)\n",
+			__func__, count);
+		return count;
+	}
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available)
+			count++;
+	}
+
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return count;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_count);
+
+/**
+ * dev_pm_opp_find_freq_exact() - search for an exact frequency
+ * @dev:		device for which we do this operation
+ * @freq:		frequency to search for
+ * @available:		true/false - match for available opp
+ *
+ * Return: Searches for exact match in the opp table and returns pointer to the
+ * matching opp if found, else returns ERR_PTR in case of error and should
+ * be handled using IS_ERR. Error return values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * Note: available is a modifier for the search. if available=true, then the
+ * match is for exact matching frequency and is available in the stored OPP
+ * table. if false, the match is for exact frequency which is not available.
+ *
+ * This provides a mechanism to enable an opp which is not available currently
+ * or the opposite as well.
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_exact(struct device *dev,
+					      unsigned long freq,
+					      bool available)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		int r = PTR_ERR(opp_table);
+
+		dev_err(dev, "%s: OPP table not found (%d)\n", __func__, r);
+		return ERR_PTR(r);
+	}
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available == available &&
+				temp_opp->rate == freq) {
+			opp = temp_opp;
+
+			/* Increment the reference count of OPP */
+			dev_pm_opp_get(opp);
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_exact);
+
+static noinline struct dev_pm_opp *_find_freq_ceil(struct opp_table *opp_table,
+						   unsigned long *freq)
+{
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available && temp_opp->rate >= *freq) {
+			opp = temp_opp;
+			*freq = opp->rate;
+
+			/* Increment the reference count of OPP */
+			dev_pm_opp_get(opp);
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+
+	return opp;
+}
+
+/**
+ * dev_pm_opp_find_freq_ceil() - Search for an rounded ceil freq
+ * @dev:	device for which we do this operation
+ * @freq:	Start frequency
+ *
+ * Search for the matching ceil *available* OPP from a starting freq
+ * for a device.
+ *
+ * Return: matching *opp and refreshes *freq accordingly, else returns
+ * ERR_PTR in case of error and should be handled using IS_ERR. Error return
+ * values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_ceil(struct device *dev,
+					     unsigned long *freq)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *opp;
+
+	if (!dev || !freq) {
+		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
+		return ERR_PTR(-EINVAL);
+	}
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return ERR_CAST(opp_table);
+
+	opp = _find_freq_ceil(opp_table, freq);
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_ceil);
+
+/**
+ * dev_pm_opp_find_freq_floor() - Search for a rounded floor freq
+ * @dev:	device for which we do this operation
+ * @freq:	Start frequency
+ *
+ * Search for the matching floor *available* OPP from a starting freq
+ * for a device.
+ *
+ * Return: matching *opp and refreshes *freq accordingly, else returns
+ * ERR_PTR in case of error and should be handled using IS_ERR. Error return
+ * values can be:
+ * EINVAL:	for bad pointer
+ * ERANGE:	no match found for search
+ * ENODEV:	if device not found in list of registered devices
+ *
+ * The callers are required to call dev_pm_opp_put() for the returned OPP after
+ * use.
+ */
+struct dev_pm_opp *dev_pm_opp_find_freq_floor(struct device *dev,
+					      unsigned long *freq)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *temp_opp, *opp = ERR_PTR(-ERANGE);
+
+	if (!dev || !freq) {
+		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
+		return ERR_PTR(-EINVAL);
+	}
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return ERR_CAST(opp_table);
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(temp_opp, &opp_table->opp_list, node) {
+		if (temp_opp->available) {
+			/* go to the next node, before choosing prev */
+			if (temp_opp->rate > *freq)
+				break;
+			else
+				opp = temp_opp;
+		}
+	}
+
+	/* Increment the reference count of OPP */
+	if (!IS_ERR(opp))
+		dev_pm_opp_get(opp);
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+
+	if (!IS_ERR(opp))
+		*freq = opp->rate;
+
+	return opp;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_find_freq_floor);
+
+static int _set_opp_voltage(struct device *dev, struct regulator *reg,
+			    struct dev_pm_opp_supply *supply)
+{
+	int ret;
+
+	/* Regulator not available for device */
+	if (IS_ERR(reg)) {
+		dev_dbg(dev, "%s: regulator not available: %ld\n", __func__,
+			PTR_ERR(reg));
+		return 0;
+	}
+
+	dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__,
+		supply->u_volt_min, supply->u_volt, supply->u_volt_max);
+
+	ret = regulator_set_voltage_triplet(reg, supply->u_volt_min,
+					    supply->u_volt, supply->u_volt_max);
+	if (ret)
+		dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
+			__func__, supply->u_volt_min, supply->u_volt,
+			supply->u_volt_max, ret);
+
+	return ret;
+}
+
+static inline int
+_generic_set_opp_clk_only(struct device *dev, struct clk *clk,
+			  unsigned long old_freq, unsigned long freq)
+{
+	int ret;
+
+	ret = clk_set_rate(clk, freq);
+	if (ret) {
+		dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+			ret);
+	}
+
+	return ret;
+}
+
+static int _generic_set_opp_regulator(const struct opp_table *opp_table,
+				      struct device *dev,
+				      unsigned long old_freq,
+				      unsigned long freq,
+				      struct dev_pm_opp_supply *old_supply,
+				      struct dev_pm_opp_supply *new_supply)
+{
+	struct regulator *reg = opp_table->regulators[0];
+	int ret;
+
+	/* This function only supports single regulator per device */
+	if (WARN_ON(opp_table->regulator_count > 1)) {
+		dev_err(dev, "multiple regulators are not supported\n");
+		return -EINVAL;
+	}
+
+	/* Scaling up? Scale voltage before frequency */
+	if (freq > old_freq) {
+		ret = _set_opp_voltage(dev, reg, new_supply);
+		if (ret)
+			goto restore_voltage;
+	}
+
+	/* Change frequency */
+	ret = _generic_set_opp_clk_only(dev, opp_table->clk, old_freq, freq);
+	if (ret)
+		goto restore_voltage;
+
+	/* Scaling down? Scale voltage after frequency */
+	if (freq < old_freq) {
+		ret = _set_opp_voltage(dev, reg, new_supply);
+		if (ret)
+			goto restore_freq;
+	}
+
+	return 0;
+
+restore_freq:
+	if (_generic_set_opp_clk_only(dev, opp_table->clk, freq, old_freq))
+		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
+			__func__, old_freq);
+restore_voltage:
+	/* This shouldn't harm even if the voltages weren't updated earlier */
+	if (old_supply)
+		_set_opp_voltage(dev, reg, old_supply);
+
+	return ret;
+}
+
+/**
+ * dev_pm_opp_set_rate() - Configure new OPP based on frequency
+ * @dev:	 device for which we do this operation
+ * @target_freq: frequency to achieve
+ *
+ * This configures the power-supplies and clock source to the levels specified
+ * by the OPP corresponding to the target_freq.
+ */
+int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
+{
+	struct opp_table *opp_table;
+	unsigned long freq, old_freq;
+	struct dev_pm_opp *old_opp, *opp;
+	struct clk *clk;
+	int ret, size;
+
+	if (unlikely(!target_freq)) {
+		dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
+			target_freq);
+		return -EINVAL;
+	}
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "%s: device opp doesn't exist\n", __func__);
+		return PTR_ERR(opp_table);
+	}
+
+	clk = opp_table->clk;
+	if (IS_ERR(clk)) {
+		dev_err(dev, "%s: No clock available for the device\n",
+			__func__);
+		ret = PTR_ERR(clk);
+		goto put_opp_table;
+	}
+
+	freq = clk_round_rate(clk, target_freq);
+	if ((long)freq <= 0)
+		freq = target_freq;
+
+	old_freq = clk_get_rate(clk);
+
+	/* Return early if nothing to do */
+	if (old_freq == freq) {
+		dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n",
+			__func__, freq);
+		ret = 0;
+		goto put_opp_table;
+	}
+
+	old_opp = _find_freq_ceil(opp_table, &old_freq);
+	if (IS_ERR(old_opp)) {
+		dev_err(dev, "%s: failed to find current OPP for freq %lu (%ld)\n",
+			__func__, old_freq, PTR_ERR(old_opp));
+	}
+
+	opp = _find_freq_ceil(opp_table, &freq);
+	if (IS_ERR(opp)) {
+		ret = PTR_ERR(opp);
+		dev_err(dev, "%s: failed to find OPP for freq %lu (%d)\n",
+			__func__, freq, ret);
+		goto put_old_opp;
+	}
+
+	dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__,
+		old_freq, freq);
+
+	/* Only frequency scaling */
+	if (!opp_table->regulators) {
+		ret = _generic_set_opp_clk_only(dev, clk, old_freq, freq);
+	} else if (!opp_table->set_opp) {
+		ret = _generic_set_opp_regulator(opp_table, dev, old_freq, freq,
+						 IS_ERR(old_opp) ? NULL : old_opp->supplies,
+						 opp->supplies);
+	} else {
+		struct dev_pm_set_opp_data *data;
+
+		data = opp_table->set_opp_data;
+		data->regulators = opp_table->regulators;
+		data->regulator_count = opp_table->regulator_count;
+		data->clk = clk;
+		data->dev = dev;
+
+		data->old_opp.rate = old_freq;
+		size = sizeof(*opp->supplies) * opp_table->regulator_count;
+		if (IS_ERR(old_opp))
+			memset(data->old_opp.supplies, 0, size);
+		else
+			memcpy(data->old_opp.supplies, old_opp->supplies, size);
+
+		data->new_opp.rate = freq;
+		memcpy(data->new_opp.supplies, opp->supplies, size);
+
+		ret = opp_table->set_opp(data);
+	}
+
+	dev_pm_opp_put(opp);
+put_old_opp:
+	if (!IS_ERR(old_opp))
+		dev_pm_opp_put(old_opp);
+put_opp_table:
+	dev_pm_opp_put_opp_table(opp_table);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
+
+/* OPP-dev Helpers */
+static void _remove_opp_dev(struct opp_device *opp_dev,
+			    struct opp_table *opp_table)
+{
+	opp_debug_unregister(opp_dev, opp_table);
+	list_del(&opp_dev->node);
+	kfree(opp_dev);
+}
+
+struct opp_device *_add_opp_dev(const struct device *dev,
+				struct opp_table *opp_table)
+{
+	struct opp_device *opp_dev;
+	int ret;
+
+	opp_dev = kzalloc(sizeof(*opp_dev), GFP_KERNEL);
+	if (!opp_dev)
+		return NULL;
+
+	/* Initialize opp-dev */
+	opp_dev->dev = dev;
+	list_add(&opp_dev->node, &opp_table->dev_list);
+
+	/* Create debugfs entries for the opp_table */
+	ret = opp_debug_register(opp_dev, opp_table);
+	if (ret)
+		dev_err(dev, "%s: Failed to register opp debugfs (%d)\n",
+			__func__, ret);
+
+	return opp_dev;
+}
+
+static struct opp_table *_allocate_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+	struct opp_device *opp_dev;
+	int ret;
+
+	/*
+	 * Allocate a new OPP table. In the infrequent case where a new
+	 * device is needed to be added, we pay this penalty.
+	 */
+	opp_table = kzalloc(sizeof(*opp_table), GFP_KERNEL);
+	if (!opp_table)
+		return NULL;
+
+	INIT_LIST_HEAD(&opp_table->dev_list);
+
+	opp_dev = _add_opp_dev(dev, opp_table);
+	if (!opp_dev) {
+		kfree(opp_table);
+		return NULL;
+	}
+
+	_of_init_opp_table(opp_table, dev);
+
+	/* Find clk for the device */
+	opp_table->clk = clk_get(dev, NULL);
+	if (IS_ERR(opp_table->clk)) {
+		ret = PTR_ERR(opp_table->clk);
+		if (ret != -EPROBE_DEFER)
+			dev_dbg(dev, "%s: Couldn't find clock: %d\n", __func__,
+				ret);
+	}
+
+	BLOCKING_INIT_NOTIFIER_HEAD(&opp_table->head);
+	INIT_LIST_HEAD(&opp_table->opp_list);
+	mutex_init(&opp_table->lock);
+	kref_init(&opp_table->kref);
+
+	/* Secure the device table modification */
+	list_add(&opp_table->node, &opp_tables);
+	return opp_table;
+}
+
+void _get_opp_table_kref(struct opp_table *opp_table)
+{
+	kref_get(&opp_table->kref);
+}
+
+struct opp_table *dev_pm_opp_get_opp_table(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	/* Hold our table modification lock here */
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _find_opp_table_unlocked(dev);
+	if (!IS_ERR(opp_table))
+		goto unlock;
+
+	opp_table = _allocate_opp_table(dev);
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+
+	return opp_table;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_opp_table);
+
+static void _opp_table_kref_release(struct kref *kref)
+{
+	struct opp_table *opp_table = container_of(kref, struct opp_table, kref);
+	struct opp_device *opp_dev;
+
+	/* Release clk */
+	if (!IS_ERR(opp_table->clk))
+		clk_put(opp_table->clk);
+
+	opp_dev = list_first_entry(&opp_table->dev_list, struct opp_device,
+				   node);
+
+	_remove_opp_dev(opp_dev, opp_table);
+
+	/* dev_list must be empty now */
+	WARN_ON(!list_empty(&opp_table->dev_list));
+
+	mutex_destroy(&opp_table->lock);
+	list_del(&opp_table->node);
+	kfree(opp_table);
+
+	mutex_unlock(&opp_table_lock);
+}
+
+void dev_pm_opp_put_opp_table(struct opp_table *opp_table)
+{
+	kref_put_mutex(&opp_table->kref, _opp_table_kref_release,
+		       &opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_opp_table);
+
+void _opp_free(struct dev_pm_opp *opp)
+{
+	kfree(opp);
+}
+
+static void _opp_kref_release(struct kref *kref)
+{
+	struct dev_pm_opp *opp = container_of(kref, struct dev_pm_opp, kref);
+	struct opp_table *opp_table = opp->opp_table;
+
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_REMOVE, opp);
+	opp_debug_remove_one(opp);
+	list_del(&opp->node);
+	kfree(opp);
+
+	mutex_unlock(&opp_table->lock);
+	dev_pm_opp_put_opp_table(opp_table);
+}
+
+static void dev_pm_opp_get(struct dev_pm_opp *opp)
+{
+	kref_get(&opp->kref);
+}
+
+void dev_pm_opp_put(struct dev_pm_opp *opp)
+{
+	kref_put_mutex(&opp->kref, _opp_kref_release, &opp->opp_table->lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put);
+
+/**
+ * dev_pm_opp_remove()  - Remove an OPP from OPP table
+ * @dev:	device for which we do this operation
+ * @freq:	OPP to remove with matching 'freq'
+ *
+ * This function removes an opp from the opp table.
+ */
+void dev_pm_opp_remove(struct device *dev, unsigned long freq)
+{
+	struct dev_pm_opp *opp;
+	struct opp_table *opp_table;
+	bool found = false;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return;
+
+	mutex_lock(&opp_table->lock);
+
+	list_for_each_entry(opp, &opp_table->opp_list, node) {
+		if (opp->rate == freq) {
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table->lock);
+
+	if (found) {
+		dev_pm_opp_put(opp);
+	} else {
+		dev_warn(dev, "%s: Couldn't find OPP with freq: %lu\n",
+			 __func__, freq);
+	}
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_remove);
+
+struct dev_pm_opp *_opp_allocate(struct opp_table *table)
+{
+	struct dev_pm_opp *opp;
+	int count, supply_size;
+
+	/* Allocate space for at least one supply */
+	count = table->regulator_count ? table->regulator_count : 1;
+	supply_size = sizeof(*opp->supplies) * count;
+
+	/* allocate new OPP node and supplies structures */
+	opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL);
+	if (!opp)
+		return NULL;
+
+	/* Put the supplies at the end of the OPP structure as an empty array */
+	opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
+	INIT_LIST_HEAD(&opp->node);
+
+	return opp;
+}
+
+static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
+					 struct opp_table *opp_table)
+{
+	struct regulator *reg;
+	int i;
+
+	for (i = 0; i < opp_table->regulator_count; i++) {
+		reg = opp_table->regulators[i];
+
+		if (!regulator_is_supported_voltage(reg,
+					opp->supplies[i].u_volt_min,
+					opp->supplies[i].u_volt_max)) {
+			pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
+				__func__, opp->supplies[i].u_volt_min,
+				opp->supplies[i].u_volt_max);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Returns:
+ * 0: On success. And appropriate error message for duplicate OPPs.
+ * -EBUSY: For OPP with same freq/volt and is available. The callers of
+ *  _opp_add() must return 0 if they receive -EBUSY from it. This is to make
+ *  sure we don't print error messages unnecessarily if different parts of
+ *  kernel try to initialize the OPP table.
+ * -EEXIST: For OPP with same freq but different volt or is unavailable. This
+ *  should be considered an error by the callers of _opp_add().
+ */
+int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
+	     struct opp_table *opp_table)
+{
+	struct dev_pm_opp *opp;
+	struct list_head *head;
+	int ret;
+
+	/*
+	 * Insert new OPP in order of increasing frequency and discard if
+	 * already present.
+	 *
+	 * Need to use &opp_table->opp_list in the condition part of the 'for'
+	 * loop, don't replace it with head otherwise it will become an infinite
+	 * loop.
+	 */
+	mutex_lock(&opp_table->lock);
+	head = &opp_table->opp_list;
+
+	list_for_each_entry(opp, &opp_table->opp_list, node) {
+		if (new_opp->rate > opp->rate) {
+			head = &opp->node;
+			continue;
+		}
+
+		if (new_opp->rate < opp->rate)
+			break;
+
+		/* Duplicate OPPs */
+		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
+			 __func__, opp->rate, opp->supplies[0].u_volt,
+			 opp->available, new_opp->rate,
+			 new_opp->supplies[0].u_volt, new_opp->available);
+
+		/* Should we compare voltages for all regulators here ? */
+		ret = opp->available &&
+		      new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? -EBUSY : -EEXIST;
+
+		mutex_unlock(&opp_table->lock);
+		return ret;
+	}
+
+	list_add(&new_opp->node, head);
+	mutex_unlock(&opp_table->lock);
+
+	new_opp->opp_table = opp_table;
+	kref_init(&new_opp->kref);
+
+	/* Get a reference to the OPP table */
+	_get_opp_table_kref(opp_table);
+
+	ret = opp_debug_create_one(new_opp, opp_table);
+	if (ret)
+		dev_err(dev, "%s: Failed to register opp to debugfs (%d)\n",
+			__func__, ret);
+
+	if (!_opp_supported_by_regulators(new_opp, opp_table)) {
+		new_opp->available = false;
+		dev_warn(dev, "%s: OPP not supported by regulators (%lu)\n",
+			 __func__, new_opp->rate);
+	}
+
+	return 0;
+}
+
+/**
+ * _opp_add_v1() - Allocate a OPP based on v1 bindings.
+ * @opp_table:	OPP table
+ * @dev:	device for which we do this operation
+ * @freq:	Frequency in Hz for this OPP
+ * @u_volt:	Voltage in uVolts for this OPP
+ * @dynamic:	Dynamically added OPPs.
+ *
+ * This function adds an opp definition to the opp table and returns status.
+ * The opp is made available by default and it can be controlled using
+ * dev_pm_opp_enable/disable functions and may be removed by dev_pm_opp_remove.
+ *
+ * NOTE: "dynamic" parameter impacts OPPs added by the dev_pm_opp_of_add_table
+ * and freed by dev_pm_opp_of_remove_table.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ */
+int _opp_add_v1(struct opp_table *opp_table, struct device *dev,
+		unsigned long freq, long u_volt, bool dynamic)
+{
+	struct dev_pm_opp *new_opp;
+	unsigned long tol;
+	int ret;
+
+	new_opp = _opp_allocate(opp_table);
+	if (!new_opp)
+		return -ENOMEM;
+
+	/* populate the opp table */
+	new_opp->rate = freq;
+	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
+	new_opp->supplies[0].u_volt = u_volt;
+	new_opp->supplies[0].u_volt_min = u_volt - tol;
+	new_opp->supplies[0].u_volt_max = u_volt + tol;
+	new_opp->available = true;
+	new_opp->dynamic = dynamic;
+
+	ret = _opp_add(dev, new_opp, opp_table);
+	if (ret) {
+		/* Don't return error for duplicate OPPs */
+		if (ret == -EBUSY)
+			ret = 0;
+		goto free_opp;
+	}
+
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp);
+	return 0;
+
+free_opp:
+	_opp_free(new_opp);
+
+	return ret;
+}
+
+/**
+ * dev_pm_opp_set_supported_hw() - Set supported platforms
+ * @dev: Device for which supported-hw has to be set.
+ * @versions: Array of hierarchy of versions to match.
+ * @count: Number of elements in the array.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the hierarchy of versions it supports. OPP layer will then enable
+ * OPPs, which are available for those versions, based on its 'opp-supported-hw'
+ * property.
+ */
+struct opp_table *dev_pm_opp_set_supported_hw(struct device *dev,
+			const u32 *versions, unsigned int count)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	/* Do we already have a version hierarchy associated with opp_table? */
+	if (opp_table->supported_hw) {
+		dev_err(dev, "%s: Already have supported hardware list\n",
+			__func__);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->supported_hw = kmemdup(versions, count * sizeof(*versions),
+					GFP_KERNEL);
+	if (!opp_table->supported_hw) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	opp_table->supported_hw_count = count;
+
+	return opp_table;
+
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_supported_hw);
+
+/**
+ * dev_pm_opp_put_supported_hw() - Releases resources blocked for supported hw
+ * @opp_table: OPP table returned by dev_pm_opp_set_supported_hw().
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_supported_hw(). Until this is called, the opp_table structure
+ * will not be freed.
+ */
+void dev_pm_opp_put_supported_hw(struct opp_table *opp_table)
+{
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	if (!opp_table->supported_hw) {
+		pr_err("%s: Doesn't have supported hardware list\n",
+		       __func__);
+		return;
+	}
+
+	kfree(opp_table->supported_hw);
+	opp_table->supported_hw = NULL;
+	opp_table->supported_hw_count = 0;
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_supported_hw);
+
+/**
+ * dev_pm_opp_set_prop_name() - Set prop-extn name
+ * @dev: Device for which the prop-name has to be set.
+ * @name: name to postfix to properties.
+ *
+ * This is required only for the V2 bindings, and it enables a platform to
+ * specify the extn to be used for certain property names. The properties to
+ * which the extension will apply are opp-microvolt and opp-microamp. OPP core
+ * should postfix the property name with -<name> while looking for them.
+ */
+struct opp_table *dev_pm_opp_set_prop_name(struct device *dev, const char *name)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	/* Do we already have a prop-name associated with opp_table? */
+	if (opp_table->prop_name) {
+		dev_err(dev, "%s: Already have prop-name %s\n", __func__,
+			opp_table->prop_name);
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->prop_name = kstrdup(name, GFP_KERNEL);
+	if (!opp_table->prop_name) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	return opp_table;
+
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_prop_name);
+
+/**
+ * dev_pm_opp_put_prop_name() - Releases resources blocked for prop-name
+ * @opp_table: OPP table returned by dev_pm_opp_set_prop_name().
+ *
+ * This is required only for the V2 bindings, and is called for a matching
+ * dev_pm_opp_set_prop_name(). Until this is called, the opp_table structure
+ * will not be freed.
+ */
+void dev_pm_opp_put_prop_name(struct opp_table *opp_table)
+{
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	if (!opp_table->prop_name) {
+		pr_err("%s: Doesn't have a prop-name\n", __func__);
+		return;
+	}
+
+	kfree(opp_table->prop_name);
+	opp_table->prop_name = NULL;
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
+
+static int _allocate_set_opp_data(struct opp_table *opp_table)
+{
+	struct dev_pm_set_opp_data *data;
+	int len, count = opp_table->regulator_count;
+
+	if (WARN_ON(!count))
+		return -EINVAL;
+
+	/* space for set_opp_data */
+	len = sizeof(*data);
+
+	/* space for old_opp.supplies and new_opp.supplies */
+	len += 2 * sizeof(struct dev_pm_opp_supply) * count;
+
+	data = kzalloc(len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->old_opp.supplies = (void *)(data + 1);
+	data->new_opp.supplies = data->old_opp.supplies + count;
+
+	opp_table->set_opp_data = data;
+
+	return 0;
+}
+
+static void _free_set_opp_data(struct opp_table *opp_table)
+{
+	kfree(opp_table->set_opp_data);
+	opp_table->set_opp_data = NULL;
+}
+
+/**
+ * dev_pm_opp_set_regulators() - Set regulator names for the device
+ * @dev: Device for which regulator name is being set.
+ * @names: Array of pointers to the names of the regulator.
+ * @count: Number of regulators.
+ *
+ * In order to support OPP switching, OPP layer needs to know the name of the
+ * device's regulators, as the core would be required to switch voltages as
+ * well.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ */
+struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
+					    const char * const names[],
+					    unsigned int count)
+{
+	struct opp_table *opp_table;
+	struct regulator *reg;
+	int ret, i;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Already have regulators set */
+	if (opp_table->regulators) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->regulators = kmalloc_array(count,
+					      sizeof(*opp_table->regulators),
+					      GFP_KERNEL);
+	if (!opp_table->regulators) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < count; i++) {
+		reg = regulator_get_optional(dev, names[i]);
+		if (IS_ERR(reg)) {
+			ret = PTR_ERR(reg);
+			if (ret != -EPROBE_DEFER)
+				dev_err(dev, "%s: no regulator (%s) found: %d\n",
+					__func__, names[i], ret);
+			goto free_regulators;
+		}
+
+		opp_table->regulators[i] = reg;
+	}
+
+	opp_table->regulator_count = count;
+
+	/* Allocate block only once to pass to set_opp() routines */
+	ret = _allocate_set_opp_data(opp_table);
+	if (ret)
+		goto free_regulators;
+
+	return opp_table;
+
+free_regulators:
+	while (i != 0)
+		regulator_put(opp_table->regulators[--i]);
+
+	kfree(opp_table->regulators);
+	opp_table->regulators = NULL;
+	opp_table->regulator_count = 0;
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
+
+/**
+ * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
+ * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
+ */
+void dev_pm_opp_put_regulators(struct opp_table *opp_table)
+{
+	int i;
+
+	if (!opp_table->regulators) {
+		pr_err("%s: Doesn't have regulators set\n", __func__);
+		return;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	for (i = opp_table->regulator_count - 1; i >= 0; i--)
+		regulator_put(opp_table->regulators[i]);
+
+	_free_set_opp_data(opp_table);
+
+	kfree(opp_table->regulators);
+	opp_table->regulators = NULL;
+	opp_table->regulator_count = 0;
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
+
+/**
+ * dev_pm_opp_set_clkname() - Set clk name for the device
+ * @dev: Device for which clk name is being set.
+ * @name: Clk name.
+ *
+ * In order to support OPP switching, OPP layer needs to get pointer to the
+ * clock for the device. Simple cases work fine without using this routine (i.e.
+ * by passing connection-id as NULL), but for a device with multiple clocks
+ * available, the OPP core needs to know the exact name of the clk to use.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ */
+struct opp_table *dev_pm_opp_set_clkname(struct device *dev, const char *name)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Already have default clk set, free it */
+	if (!IS_ERR(opp_table->clk))
+		clk_put(opp_table->clk);
+
+	/* Find clk for the device */
+	opp_table->clk = clk_get(dev, name);
+	if (IS_ERR(opp_table->clk)) {
+		ret = PTR_ERR(opp_table->clk);
+		if (ret != -EPROBE_DEFER) {
+			dev_err(dev, "%s: Couldn't find clock: %d\n", __func__,
+				ret);
+		}
+		goto err;
+	}
+
+	return opp_table;
+
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_clkname);
+
+/**
+ * dev_pm_opp_put_clkname() - Releases resources blocked for clk.
+ * @opp_table: OPP table returned from dev_pm_opp_set_clkname().
+ */
+void dev_pm_opp_put_clkname(struct opp_table *opp_table)
+{
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	clk_put(opp_table->clk);
+	opp_table->clk = ERR_PTR(-EINVAL);
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_clkname);
+
+/**
+ * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
+ * @dev: Device for which the helper is getting registered.
+ * @set_opp: Custom set OPP helper.
+ *
+ * This is useful to support complex platforms (like platforms with multiple
+ * regulators per device), instead of the generic OPP set rate helper.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ */
+struct opp_table *dev_pm_opp_register_set_opp_helper(struct device *dev,
+			int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	if (!set_opp)
+		return ERR_PTR(-EINVAL);
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return ERR_PTR(-ENOMEM);
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Already have custom set_opp helper */
+	if (WARN_ON(opp_table->set_opp)) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->set_opp = set_opp;
+
+	return opp_table;
+
+err:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
+
+/**
+ * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for
+ *					   set_opp helper
+ * @opp_table: OPP table returned from dev_pm_opp_register_set_opp_helper().
+ *
+ * Release resources blocked for platform specific set_opp helper.
+ */
+void dev_pm_opp_register_put_opp_helper(struct opp_table *opp_table)
+{
+	if (!opp_table->set_opp) {
+		pr_err("%s: Doesn't have custom set_opp helper set\n",
+		       __func__);
+		return;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	opp_table->set_opp = NULL;
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper);
+
+/**
+ * dev_pm_opp_add()  - Add an OPP table from a table definitions
+ * @dev:	device for which we do this operation
+ * @freq:	Frequency in Hz for this OPP
+ * @u_volt:	Voltage in uVolts for this OPP
+ *
+ * This function adds an opp definition to the opp table and returns status.
+ * The opp is made available by default and it can be controlled using
+ * dev_pm_opp_enable/disable functions.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ */
+int dev_pm_opp_add(struct device *dev, unsigned long freq, unsigned long u_volt)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return -ENOMEM;
+
+	ret = _opp_add_v1(opp_table, dev, freq, u_volt, true);
+
+	dev_pm_opp_put_opp_table(opp_table);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_add);
+
+/**
+ * _opp_set_availability() - helper to set the availability of an opp
+ * @dev:		device for which we do this operation
+ * @freq:		OPP frequency to modify availability
+ * @availability_req:	availability status requested for this opp
+ *
+ * Set the availability of an OPP, opp_{enable,disable} share a common logic
+ * which is isolated here.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ */
+static int _opp_set_availability(struct device *dev, unsigned long freq,
+				 bool availability_req)
+{
+	struct opp_table *opp_table;
+	struct dev_pm_opp *tmp_opp, *opp = ERR_PTR(-ENODEV);
+	int r = 0;
+
+	/* Find the opp_table */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		r = PTR_ERR(opp_table);
+		dev_warn(dev, "%s: Device OPP not found (%d)\n", __func__, r);
+		return r;
+	}
+
+	mutex_lock(&opp_table->lock);
+
+	/* Do we have the frequency? */
+	list_for_each_entry(tmp_opp, &opp_table->opp_list, node) {
+		if (tmp_opp->rate == freq) {
+			opp = tmp_opp;
+			break;
+		}
+	}
+
+	if (IS_ERR(opp)) {
+		r = PTR_ERR(opp);
+		goto unlock;
+	}
+
+	/* Is update really needed? */
+	if (opp->available == availability_req)
+		goto unlock;
+
+	opp->available = availability_req;
+
+	dev_pm_opp_get(opp);
+	mutex_unlock(&opp_table->lock);
+
+	/* Notify the change of the OPP availability */
+	if (availability_req)
+		blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ENABLE,
+					     opp);
+	else
+		blocking_notifier_call_chain(&opp_table->head,
+					     OPP_EVENT_DISABLE, opp);
+
+	dev_pm_opp_put(opp);
+	goto put_table;
+
+unlock:
+	mutex_unlock(&opp_table->lock);
+put_table:
+	dev_pm_opp_put_opp_table(opp_table);
+	return r;
+}
+
+/**
+ * dev_pm_opp_enable() - Enable a specific OPP
+ * @dev:	device for which we do this operation
+ * @freq:	OPP frequency to enable
+ *
+ * Enables a provided opp. If the operation is valid, this returns 0, else the
+ * corresponding error value. It is meant to be used for users an OPP available
+ * after being temporarily made unavailable with dev_pm_opp_disable.
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ */
+int dev_pm_opp_enable(struct device *dev, unsigned long freq)
+{
+	return _opp_set_availability(dev, freq, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
+
+/**
+ * dev_pm_opp_disable() - Disable a specific OPP
+ * @dev:	device for which we do this operation
+ * @freq:	OPP frequency to disable
+ *
+ * Disables a provided opp. If the operation is valid, this returns
+ * 0, else the corresponding error value. It is meant to be a temporary
+ * control by users to make this OPP not available until the circumstances are
+ * right to make it available again (with a call to dev_pm_opp_enable).
+ *
+ * Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
+ * copy operation, returns 0 if no modification was done OR modification was
+ * successful.
+ */
+int dev_pm_opp_disable(struct device *dev, unsigned long freq)
+{
+	return _opp_set_availability(dev, freq, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_disable);
+
+/**
+ * dev_pm_opp_register_notifier() - Register OPP notifier for the device
+ * @dev:	Device for which notifier needs to be registered
+ * @nb:		Notifier block to be registered
+ *
+ * Return: 0 on success or a negative error value.
+ */
+int dev_pm_opp_register_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	ret = blocking_notifier_chain_register(&opp_table->head, nb);
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_pm_opp_register_notifier);
+
+/**
+ * dev_pm_opp_unregister_notifier() - Unregister OPP notifier for the device
+ * @dev:	Device for which notifier needs to be unregistered
+ * @nb:		Notifier block to be unregistered
+ *
+ * Return: 0 on success or a negative error value.
+ */
+int dev_pm_opp_unregister_notifier(struct device *dev,
+				   struct notifier_block *nb)
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	ret = blocking_notifier_chain_unregister(&opp_table->head, nb);
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL(dev_pm_opp_unregister_notifier);
+
+/*
+ * Free OPPs either created using static entries present in DT or even the
+ * dynamically added entries based on remove_all param.
+ */
+void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev,
+			      bool remove_all)
+{
+	struct dev_pm_opp *opp, *tmp;
+
+	/* Find if opp_table manages a single device */
+	if (list_is_singular(&opp_table->dev_list)) {
+		/* Free static OPPs */
+		list_for_each_entry_safe(opp, tmp, &opp_table->opp_list, node) {
+			if (remove_all || !opp->dynamic)
+				dev_pm_opp_put(opp);
+		}
+	} else {
+		_remove_opp_dev(_find_opp_dev(dev, opp_table), opp_table);
+	}
+}
+
+void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all)
+{
+	struct opp_table *opp_table;
+
+	/* Check for existing table for 'dev' */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		int error = PTR_ERR(opp_table);
+
+		if (error != -ENODEV)
+			WARN(1, "%s: opp_table: %d\n",
+			     IS_ERR_OR_NULL(dev) ?
+					"Invalid device" : dev_name(dev),
+			     error);
+		return;
+	}
+
+	_dev_pm_opp_remove_table(opp_table, dev, remove_all);
+
+	dev_pm_opp_put_opp_table(opp_table);
+}
+
+/**
+ * dev_pm_opp_remove_table() - Free all OPPs associated with the device
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Free both OPPs created using static entries present in DT and the
+ * dynamically added entries.
+ */
+void dev_pm_opp_remove_table(struct device *dev)
+{
+	_dev_pm_opp_find_and_remove_table(dev, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_remove_table);
diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c
new file mode 100644
index 000000000000..2d87bc1adf38
--- /dev/null
+++ b/drivers/opp/cpu.c
@@ -0,0 +1,236 @@
+/*
+ * Generic OPP helper interface for CPU device
+ *
+ * Copyright (C) 2009-2014 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/cpufreq.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+
+#include "opp.h"
+
+#ifdef CONFIG_CPU_FREQ
+
+/**
+ * dev_pm_opp_init_cpufreq_table() - create a cpufreq table for a device
+ * @dev:	device for which we do this operation
+ * @table:	Cpufreq table returned back to caller
+ *
+ * Generate a cpufreq table for a provided device- this assumes that the
+ * opp table is already initialized and ready for usage.
+ *
+ * This function allocates required memory for the cpufreq table. It is
+ * expected that the caller does the required maintenance such as freeing
+ * the table as required.
+ *
+ * Returns -EINVAL for bad pointers, -ENODEV if the device is not found, -ENOMEM
+ * if no memory available for the operation (table is not populated), returns 0
+ * if successful and table is populated.
+ *
+ * WARNING: It is  important for the callers to ensure refreshing their copy of
+ * the table if any of the mentioned functions have been invoked in the interim.
+ */
+int dev_pm_opp_init_cpufreq_table(struct device *dev,
+				  struct cpufreq_frequency_table **table)
+{
+	struct dev_pm_opp *opp;
+	struct cpufreq_frequency_table *freq_table = NULL;
+	int i, max_opps, ret = 0;
+	unsigned long rate;
+
+	max_opps = dev_pm_opp_get_opp_count(dev);
+	if (max_opps <= 0)
+		return max_opps ? max_opps : -ENODATA;
+
+	freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC);
+	if (!freq_table)
+		return -ENOMEM;
+
+	for (i = 0, rate = 0; i < max_opps; i++, rate++) {
+		/* find next rate */
+		opp = dev_pm_opp_find_freq_ceil(dev, &rate);
+		if (IS_ERR(opp)) {
+			ret = PTR_ERR(opp);
+			goto out;
+		}
+		freq_table[i].driver_data = i;
+		freq_table[i].frequency = rate / 1000;
+
+		/* Is Boost/turbo opp ? */
+		if (dev_pm_opp_is_turbo(opp))
+			freq_table[i].flags = CPUFREQ_BOOST_FREQ;
+
+		dev_pm_opp_put(opp);
+	}
+
+	freq_table[i].driver_data = i;
+	freq_table[i].frequency = CPUFREQ_TABLE_END;
+
+	*table = &freq_table[0];
+
+out:
+	if (ret)
+		kfree(freq_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_init_cpufreq_table);
+
+/**
+ * dev_pm_opp_free_cpufreq_table() - free the cpufreq table
+ * @dev:	device for which we do this operation
+ * @table:	table to free
+ *
+ * Free up the table allocated by dev_pm_opp_init_cpufreq_table
+ */
+void dev_pm_opp_free_cpufreq_table(struct device *dev,
+				   struct cpufreq_frequency_table **table)
+{
+	if (!table)
+		return;
+
+	kfree(*table);
+	*table = NULL;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_free_cpufreq_table);
+#endif	/* CONFIG_CPU_FREQ */
+
+void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of)
+{
+	struct device *cpu_dev;
+	int cpu;
+
+	WARN_ON(cpumask_empty(cpumask));
+
+	for_each_cpu(cpu, cpumask) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			continue;
+		}
+
+		if (of)
+			dev_pm_opp_of_remove_table(cpu_dev);
+		else
+			dev_pm_opp_remove_table(cpu_dev);
+	}
+}
+
+/**
+ * dev_pm_opp_cpumask_remove_table() - Removes OPP table for @cpumask
+ * @cpumask:	cpumask for which OPP table needs to be removed
+ *
+ * This removes the OPP tables for CPUs present in the @cpumask.
+ * This should be used to remove all the OPPs entries associated with
+ * the cpus in @cpumask.
+ */
+void dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask)
+{
+	_dev_pm_opp_cpumask_remove_table(cpumask, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_cpumask_remove_table);
+
+/**
+ * dev_pm_opp_set_sharing_cpus() - Mark OPP table as shared by few CPUs
+ * @cpu_dev:	CPU device for which we do this operation
+ * @cpumask:	cpumask of the CPUs which share the OPP table with @cpu_dev
+ *
+ * This marks OPP table of the @cpu_dev as shared by the CPUs present in
+ * @cpumask.
+ *
+ * Returns -ENODEV if OPP table isn't already present.
+ */
+int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev,
+				const struct cpumask *cpumask)
+{
+	struct opp_device *opp_dev;
+	struct opp_table *opp_table;
+	struct device *dev;
+	int cpu, ret = 0;
+
+	opp_table = _find_opp_table(cpu_dev);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	for_each_cpu(cpu, cpumask) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		dev = get_cpu_device(cpu);
+		if (!dev) {
+			dev_err(cpu_dev, "%s: failed to get cpu%d device\n",
+				__func__, cpu);
+			continue;
+		}
+
+		opp_dev = _add_opp_dev(dev, opp_table);
+		if (!opp_dev) {
+			dev_err(dev, "%s: failed to add opp-dev for cpu%d device\n",
+				__func__, cpu);
+			continue;
+		}
+
+		/* Mark opp-table as multiple CPUs are sharing it now */
+		opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED;
+	}
+
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_sharing_cpus);
+
+/**
+ * dev_pm_opp_get_sharing_cpus() - Get cpumask of CPUs sharing OPPs with @cpu_dev
+ * @cpu_dev:	CPU device for which we do this operation
+ * @cpumask:	cpumask to update with information of sharing CPUs
+ *
+ * This updates the @cpumask with CPUs that are sharing OPPs with @cpu_dev.
+ *
+ * Returns -ENODEV if OPP table isn't already present and -EINVAL if the OPP
+ * table's status is access-unknown.
+ */
+int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask)
+{
+	struct opp_device *opp_dev;
+	struct opp_table *opp_table;
+	int ret = 0;
+
+	opp_table = _find_opp_table(cpu_dev);
+	if (IS_ERR(opp_table))
+		return PTR_ERR(opp_table);
+
+	if (opp_table->shared_opp == OPP_TABLE_ACCESS_UNKNOWN) {
+		ret = -EINVAL;
+		goto put_opp_table;
+	}
+
+	cpumask_clear(cpumask);
+
+	if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) {
+		list_for_each_entry(opp_dev, &opp_table->dev_list, node)
+			cpumask_set_cpu(opp_dev->dev->id, cpumask);
+	} else {
+		cpumask_set_cpu(cpu_dev->id, cpumask);
+	}
+
+put_opp_table:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_get_sharing_cpus);
diff --git a/drivers/opp/debugfs.c b/drivers/opp/debugfs.c
new file mode 100644
index 000000000000..81cf120fcf43
--- /dev/null
+++ b/drivers/opp/debugfs.c
@@ -0,0 +1,249 @@
+/*
+ * Generic OPP debugfs interface
+ *
+ * Copyright (C) 2015-2016 Viresh Kumar <viresh.kumar@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/limits.h>
+#include <linux/slab.h>
+
+#include "opp.h"
+
+static struct dentry *rootdir;
+
+static void opp_set_dev_name(const struct device *dev, char *name)
+{
+	if (dev->parent)
+		snprintf(name, NAME_MAX, "%s-%s", dev_name(dev->parent),
+			 dev_name(dev));
+	else
+		snprintf(name, NAME_MAX, "%s", dev_name(dev));
+}
+
+void opp_debug_remove_one(struct dev_pm_opp *opp)
+{
+	debugfs_remove_recursive(opp->dentry);
+}
+
+static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
+				      struct opp_table *opp_table,
+				      struct dentry *pdentry)
+{
+	struct dentry *d;
+	int i;
+	char *name;
+
+	for (i = 0; i < opp_table->regulator_count; i++) {
+		name = kasprintf(GFP_KERNEL, "supply-%d", i);
+
+		/* Create per-opp directory */
+		d = debugfs_create_dir(name, pdentry);
+
+		kfree(name);
+
+		if (!d)
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d,
+					  &opp->supplies[i].u_volt))
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d,
+					  &opp->supplies[i].u_volt_min))
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d,
+					  &opp->supplies[i].u_volt_max))
+			return false;
+
+		if (!debugfs_create_ulong("u_amp", S_IRUGO, d,
+					  &opp->supplies[i].u_amp))
+			return false;
+	}
+
+	return true;
+}
+
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
+{
+	struct dentry *pdentry = opp_table->dentry;
+	struct dentry *d;
+	char name[25];	/* 20 chars for 64 bit value + 5 (opp:\0) */
+
+	/* Rate is unique to each OPP, use it to give opp-name */
+	snprintf(name, sizeof(name), "opp:%lu", opp->rate);
+
+	/* Create per-opp directory */
+	d = debugfs_create_dir(name, pdentry);
+	if (!d)
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("available", S_IRUGO, d, &opp->available))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("dynamic", S_IRUGO, d, &opp->dynamic))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("turbo", S_IRUGO, d, &opp->turbo))
+		return -ENOMEM;
+
+	if (!debugfs_create_bool("suspend", S_IRUGO, d, &opp->suspend))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
+		return -ENOMEM;
+
+	if (!opp_debug_create_supplies(opp, opp_table, d))
+		return -ENOMEM;
+
+	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
+				  &opp->clock_latency_ns))
+		return -ENOMEM;
+
+	opp->dentry = d;
+	return 0;
+}
+
+static int opp_list_debug_create_dir(struct opp_device *opp_dev,
+				     struct opp_table *opp_table)
+{
+	const struct device *dev = opp_dev->dev;
+	struct dentry *d;
+
+	opp_set_dev_name(dev, opp_table->dentry_name);
+
+	/* Create device specific directory */
+	d = debugfs_create_dir(opp_table->dentry_name, rootdir);
+	if (!d) {
+		dev_err(dev, "%s: Failed to create debugfs dir\n", __func__);
+		return -ENOMEM;
+	}
+
+	opp_dev->dentry = d;
+	opp_table->dentry = d;
+
+	return 0;
+}
+
+static int opp_list_debug_create_link(struct opp_device *opp_dev,
+				      struct opp_table *opp_table)
+{
+	const struct device *dev = opp_dev->dev;
+	char name[NAME_MAX];
+	struct dentry *d;
+
+	opp_set_dev_name(opp_dev->dev, name);
+
+	/* Create device specific directory link */
+	d = debugfs_create_symlink(name, rootdir, opp_table->dentry_name);
+	if (!d) {
+		dev_err(dev, "%s: Failed to create link\n", __func__);
+		return -ENOMEM;
+	}
+
+	opp_dev->dentry = d;
+
+	return 0;
+}
+
+/**
+ * opp_debug_register - add a device opp node to the debugfs 'opp' directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the device-opp being added
+ *
+ * Dynamically adds device specific directory in debugfs 'opp' directory. If the
+ * device-opp is shared with other devices, then links will be created for all
+ * devices except the first.
+ *
+ * Return: 0 on success, otherwise negative error.
+ */
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table)
+{
+	if (!rootdir) {
+		pr_debug("%s: Uninitialized rootdir\n", __func__);
+		return -EINVAL;
+	}
+
+	if (opp_table->dentry)
+		return opp_list_debug_create_link(opp_dev, opp_table);
+
+	return opp_list_debug_create_dir(opp_dev, opp_table);
+}
+
+static void opp_migrate_dentry(struct opp_device *opp_dev,
+			       struct opp_table *opp_table)
+{
+	struct opp_device *new_dev;
+	const struct device *dev;
+	struct dentry *dentry;
+
+	/* Look for next opp-dev */
+	list_for_each_entry(new_dev, &opp_table->dev_list, node)
+		if (new_dev != opp_dev)
+			break;
+
+	/* new_dev is guaranteed to be valid here */
+	dev = new_dev->dev;
+	debugfs_remove_recursive(new_dev->dentry);
+
+	opp_set_dev_name(dev, opp_table->dentry_name);
+
+	dentry = debugfs_rename(rootdir, opp_dev->dentry, rootdir,
+				opp_table->dentry_name);
+	if (!dentry) {
+		dev_err(dev, "%s: Failed to rename link from: %s to %s\n",
+			__func__, dev_name(opp_dev->dev), dev_name(dev));
+		return;
+	}
+
+	new_dev->dentry = dentry;
+	opp_table->dentry = dentry;
+}
+
+/**
+ * opp_debug_unregister - remove a device opp node from debugfs opp directory
+ * @opp_dev: opp-dev pointer for device
+ * @opp_table: the device-opp being removed
+ *
+ * Dynamically removes device specific directory from debugfs 'opp' directory.
+ */
+void opp_debug_unregister(struct opp_device *opp_dev,
+			  struct opp_table *opp_table)
+{
+	if (opp_dev->dentry == opp_table->dentry) {
+		/* Move the real dentry object under another device */
+		if (!list_is_singular(&opp_table->dev_list)) {
+			opp_migrate_dentry(opp_dev, opp_table);
+			goto out;
+		}
+		opp_table->dentry = NULL;
+	}
+
+	debugfs_remove_recursive(opp_dev->dentry);
+
+out:
+	opp_dev->dentry = NULL;
+}
+
+static int __init opp_debug_init(void)
+{
+	/* Create /sys/kernel/debug/opp directory */
+	rootdir = debugfs_create_dir("opp", NULL);
+	if (!rootdir) {
+		pr_err("%s: Failed to create root directory\n", __func__);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+core_initcall(opp_debug_init);
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
new file mode 100644
index 000000000000..0b718886479b
--- /dev/null
+++ b/drivers/opp/of.c
@@ -0,0 +1,633 @@
+/*
+ * Generic OPP OF helpers
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/device.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+#include "opp.h"
+
+static struct opp_table *_managed_opp(const struct device_node *np)
+{
+	struct opp_table *opp_table, *managed_table = NULL;
+
+	mutex_lock(&opp_table_lock);
+
+	list_for_each_entry(opp_table, &opp_tables, node) {
+		if (opp_table->np == np) {
+			/*
+			 * Multiple devices can point to the same OPP table and
+			 * so will have same node-pointer, np.
+			 *
+			 * But the OPPs will be considered as shared only if the
+			 * OPP table contains a "opp-shared" property.
+			 */
+			if (opp_table->shared_opp == OPP_TABLE_ACCESS_SHARED) {
+				_get_opp_table_kref(opp_table);
+				managed_table = opp_table;
+			}
+
+			break;
+		}
+	}
+
+	mutex_unlock(&opp_table_lock);
+
+	return managed_table;
+}
+
+void _of_init_opp_table(struct opp_table *opp_table, struct device *dev)
+{
+	struct device_node *np;
+
+	/*
+	 * Only required for backward compatibility with v1 bindings, but isn't
+	 * harmful for other cases. And so we do it unconditionally.
+	 */
+	np = of_node_get(dev->of_node);
+	if (np) {
+		u32 val;
+
+		if (!of_property_read_u32(np, "clock-latency", &val))
+			opp_table->clock_latency_ns_max = val;
+		of_property_read_u32(np, "voltage-tolerance",
+				     &opp_table->voltage_tolerance_v1);
+		of_node_put(np);
+	}
+}
+
+static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
+			      struct device_node *np)
+{
+	unsigned int count = opp_table->supported_hw_count;
+	u32 version;
+	int ret;
+
+	if (!opp_table->supported_hw) {
+		/*
+		 * In the case that no supported_hw has been set by the
+		 * platform but there is an opp-supported-hw value set for
+		 * an OPP then the OPP should not be enabled as there is
+		 * no way to see if the hardware supports it.
+		 */
+		if (of_find_property(np, "opp-supported-hw", NULL))
+			return false;
+		else
+			return true;
+	}
+
+	while (count--) {
+		ret = of_property_read_u32_index(np, "opp-supported-hw", count,
+						 &version);
+		if (ret) {
+			dev_warn(dev, "%s: failed to read opp-supported-hw property at index %d: %d\n",
+				 __func__, count, ret);
+			return false;
+		}
+
+		/* Both of these are bitwise masks of the versions */
+		if (!(version & opp_table->supported_hw[count]))
+			return false;
+	}
+
+	return true;
+}
+
+static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
+			      struct opp_table *opp_table)
+{
+	u32 *microvolt, *microamp = NULL;
+	int supplies, vcount, icount, ret, i, j;
+	struct property *prop = NULL;
+	char name[NAME_MAX];
+
+	supplies = opp_table->regulator_count ? opp_table->regulator_count : 1;
+
+	/* Search for "opp-microvolt-<name>" */
+	if (opp_table->prop_name) {
+		snprintf(name, sizeof(name), "opp-microvolt-%s",
+			 opp_table->prop_name);
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (!prop) {
+		/* Search for "opp-microvolt" */
+		sprintf(name, "opp-microvolt");
+		prop = of_find_property(opp->np, name, NULL);
+
+		/* Missing property isn't a problem, but an invalid entry is */
+		if (!prop) {
+			if (!opp_table->regulator_count)
+				return 0;
+
+			dev_err(dev, "%s: opp-microvolt missing although OPP managing regulators\n",
+				__func__);
+			return -EINVAL;
+		}
+	}
+
+	vcount = of_property_count_u32_elems(opp->np, name);
+	if (vcount < 0) {
+		dev_err(dev, "%s: Invalid %s property (%d)\n",
+			__func__, name, vcount);
+		return vcount;
+	}
+
+	/* There can be one or three elements per supply */
+	if (vcount != supplies && vcount != supplies * 3) {
+		dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+			__func__, name, vcount, supplies);
+		return -EINVAL;
+	}
+
+	microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL);
+	if (!microvolt)
+		return -ENOMEM;
+
+	ret = of_property_read_u32_array(opp->np, name, microvolt, vcount);
+	if (ret) {
+		dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
+		ret = -EINVAL;
+		goto free_microvolt;
+	}
+
+	/* Search for "opp-microamp-<name>" */
+	prop = NULL;
+	if (opp_table->prop_name) {
+		snprintf(name, sizeof(name), "opp-microamp-%s",
+			 opp_table->prop_name);
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (!prop) {
+		/* Search for "opp-microamp" */
+		sprintf(name, "opp-microamp");
+		prop = of_find_property(opp->np, name, NULL);
+	}
+
+	if (prop) {
+		icount = of_property_count_u32_elems(opp->np, name);
+		if (icount < 0) {
+			dev_err(dev, "%s: Invalid %s property (%d)\n", __func__,
+				name, icount);
+			ret = icount;
+			goto free_microvolt;
+		}
+
+		if (icount != supplies) {
+			dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+				__func__, name, icount, supplies);
+			ret = -EINVAL;
+			goto free_microvolt;
+		}
+
+		microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL);
+		if (!microamp) {
+			ret = -EINVAL;
+			goto free_microvolt;
+		}
+
+		ret = of_property_read_u32_array(opp->np, name, microamp,
+						 icount);
+		if (ret) {
+			dev_err(dev, "%s: error parsing %s: %d\n", __func__,
+				name, ret);
+			ret = -EINVAL;
+			goto free_microamp;
+		}
+	}
+
+	for (i = 0, j = 0; i < supplies; i++) {
+		opp->supplies[i].u_volt = microvolt[j++];
+
+		if (vcount == supplies) {
+			opp->supplies[i].u_volt_min = opp->supplies[i].u_volt;
+			opp->supplies[i].u_volt_max = opp->supplies[i].u_volt;
+		} else {
+			opp->supplies[i].u_volt_min = microvolt[j++];
+			opp->supplies[i].u_volt_max = microvolt[j++];
+		}
+
+		if (microamp)
+			opp->supplies[i].u_amp = microamp[i];
+	}
+
+free_microamp:
+	kfree(microamp);
+free_microvolt:
+	kfree(microvolt);
+
+	return ret;
+}
+
+/**
+ * dev_pm_opp_of_remove_table() - Free OPP table entries created from static DT
+ *				  entries
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Free OPPs created using static entries present in DT.
+ */
+void dev_pm_opp_of_remove_table(struct device *dev)
+{
+	_dev_pm_opp_find_and_remove_table(dev, false);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
+
+/* Returns opp descriptor node for a device node, caller must
+ * do of_node_put() */
+static struct device_node *_opp_of_get_opp_desc_node(struct device_node *np)
+{
+	/*
+	 * There should be only ONE phandle present in "operating-points-v2"
+	 * property.
+	 */
+
+	return of_parse_phandle(np, "operating-points-v2", 0);
+}
+
+/* Returns opp descriptor node for a device, caller must do of_node_put() */
+struct device_node *dev_pm_opp_of_get_opp_desc_node(struct device *dev)
+{
+	return _opp_of_get_opp_desc_node(dev->of_node);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_opp_desc_node);
+
+/**
+ * _opp_add_static_v2() - Allocate static OPPs (As per 'v2' DT bindings)
+ * @opp_table:	OPP table
+ * @dev:	device for which we do this operation
+ * @np:		device node
+ *
+ * This function adds an opp definition to the opp table and returns status. The
+ * opp can be controlled using dev_pm_opp_enable/disable functions and may be
+ * removed by dev_pm_opp_remove.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -EINVAL	Failed parsing the OPP node
+ */
+static int _opp_add_static_v2(struct opp_table *opp_table, struct device *dev,
+			      struct device_node *np)
+{
+	struct dev_pm_opp *new_opp;
+	u64 rate;
+	u32 val;
+	int ret;
+
+	new_opp = _opp_allocate(opp_table);
+	if (!new_opp)
+		return -ENOMEM;
+
+	ret = of_property_read_u64(np, "opp-hz", &rate);
+	if (ret < 0) {
+		dev_err(dev, "%s: opp-hz not found\n", __func__);
+		goto free_opp;
+	}
+
+	/* Check if the OPP supports hardware's hierarchy of versions or not */
+	if (!_opp_is_supported(dev, opp_table, np)) {
+		dev_dbg(dev, "OPP not supported by hardware: %llu\n", rate);
+		goto free_opp;
+	}
+
+	/*
+	 * Rate is defined as an unsigned long in clk API, and so casting
+	 * explicitly to its type. Must be fixed once rate is 64 bit
+	 * guaranteed in clk API.
+	 */
+	new_opp->rate = (unsigned long)rate;
+	new_opp->turbo = of_property_read_bool(np, "turbo-mode");
+
+	new_opp->np = np;
+	new_opp->dynamic = false;
+	new_opp->available = true;
+
+	if (!of_property_read_u32(np, "clock-latency-ns", &val))
+		new_opp->clock_latency_ns = val;
+
+	ret = opp_parse_supplies(new_opp, dev, opp_table);
+	if (ret)
+		goto free_opp;
+
+	ret = _opp_add(dev, new_opp, opp_table);
+	if (ret) {
+		/* Don't return error for duplicate OPPs */
+		if (ret == -EBUSY)
+			ret = 0;
+		goto free_opp;
+	}
+
+	/* OPP to select on device suspend */
+	if (of_property_read_bool(np, "opp-suspend")) {
+		if (opp_table->suspend_opp) {
+			dev_warn(dev, "%s: Multiple suspend OPPs found (%lu %lu)\n",
+				 __func__, opp_table->suspend_opp->rate,
+				 new_opp->rate);
+		} else {
+			new_opp->suspend = true;
+			opp_table->suspend_opp = new_opp;
+		}
+	}
+
+	if (new_opp->clock_latency_ns > opp_table->clock_latency_ns_max)
+		opp_table->clock_latency_ns_max = new_opp->clock_latency_ns;
+
+	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
+		 __func__, new_opp->turbo, new_opp->rate,
+		 new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
+		 new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns);
+
+	/*
+	 * Notify the changes in the availability of the operable
+	 * frequency/voltage list.
+	 */
+	blocking_notifier_call_chain(&opp_table->head, OPP_EVENT_ADD, new_opp);
+	return 0;
+
+free_opp:
+	_opp_free(new_opp);
+
+	return ret;
+}
+
+/* Initializes OPP tables based on new bindings */
+static int _of_add_opp_table_v2(struct device *dev, struct device_node *opp_np)
+{
+	struct device_node *np;
+	struct opp_table *opp_table;
+	int ret = 0, count = 0;
+
+	opp_table = _managed_opp(opp_np);
+	if (opp_table) {
+		/* OPPs are already managed */
+		if (!_add_opp_dev(dev, opp_table))
+			ret = -ENOMEM;
+		goto put_opp_table;
+	}
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return -ENOMEM;
+
+	/* We have opp-table node now, iterate over it and add OPPs */
+	for_each_available_child_of_node(opp_np, np) {
+		count++;
+
+		ret = _opp_add_static_v2(opp_table, dev, np);
+		if (ret) {
+			dev_err(dev, "%s: Failed to add OPP, %d\n", __func__,
+				ret);
+			_dev_pm_opp_remove_table(opp_table, dev, false);
+			goto put_opp_table;
+		}
+	}
+
+	/* There should be one of more OPP defined */
+	if (WARN_ON(!count)) {
+		ret = -ENOENT;
+		goto put_opp_table;
+	}
+
+	opp_table->np = opp_np;
+	if (of_property_read_bool(opp_np, "opp-shared"))
+		opp_table->shared_opp = OPP_TABLE_ACCESS_SHARED;
+	else
+		opp_table->shared_opp = OPP_TABLE_ACCESS_EXCLUSIVE;
+
+put_opp_table:
+	dev_pm_opp_put_opp_table(opp_table);
+
+	return ret;
+}
+
+/* Initializes OPP tables based on old-deprecated bindings */
+static int _of_add_opp_table_v1(struct device *dev)
+{
+	struct opp_table *opp_table;
+	const struct property *prop;
+	const __be32 *val;
+	int nr, ret = 0;
+
+	prop = of_find_property(dev->of_node, "operating-points", NULL);
+	if (!prop)
+		return -ENODEV;
+	if (!prop->value)
+		return -ENODATA;
+
+	/*
+	 * Each OPP is a set of tuples consisting of frequency and
+	 * voltage like <freq-kHz vol-uV>.
+	 */
+	nr = prop->length / sizeof(u32);
+	if (nr % 2) {
+		dev_err(dev, "%s: Invalid OPP table\n", __func__);
+		return -EINVAL;
+	}
+
+	opp_table = dev_pm_opp_get_opp_table(dev);
+	if (!opp_table)
+		return -ENOMEM;
+
+	val = prop->value;
+	while (nr) {
+		unsigned long freq = be32_to_cpup(val++) * 1000;
+		unsigned long volt = be32_to_cpup(val++);
+
+		ret = _opp_add_v1(opp_table, dev, freq, volt, false);
+		if (ret) {
+			dev_err(dev, "%s: Failed to add OPP %ld (%d)\n",
+				__func__, freq, ret);
+			_dev_pm_opp_remove_table(opp_table, dev, false);
+			break;
+		}
+		nr -= 2;
+	}
+
+	dev_pm_opp_put_opp_table(opp_table);
+	return ret;
+}
+
+/**
+ * dev_pm_opp_of_add_table() - Initialize opp table from device tree
+ * @dev:	device pointer used to lookup OPP table.
+ *
+ * Register the initial OPP table with the OPP library for given device.
+ *
+ * Return:
+ * 0		On success OR
+ *		Duplicate OPPs (both freq and volt are same) and opp->available
+ * -EEXIST	Freq are same and volt are different OR
+ *		Duplicate OPPs (both freq and volt are same) and !opp->available
+ * -ENOMEM	Memory allocation failure
+ * -ENODEV	when 'operating-points' property is not found or is invalid data
+ *		in device node.
+ * -ENODATA	when empty 'operating-points' property is found
+ * -EINVAL	when invalid entries are found in opp-v2 table
+ */
+int dev_pm_opp_of_add_table(struct device *dev)
+{
+	struct device_node *opp_np;
+	int ret;
+
+	/*
+	 * OPPs have two version of bindings now. The older one is deprecated,
+	 * try for the new binding first.
+	 */
+	opp_np = dev_pm_opp_of_get_opp_desc_node(dev);
+	if (!opp_np) {
+		/*
+		 * Try old-deprecated bindings for backward compatibility with
+		 * older dtbs.
+		 */
+		return _of_add_opp_table_v1(dev);
+	}
+
+	ret = _of_add_opp_table_v2(dev, opp_np);
+	of_node_put(opp_np);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_add_table);
+
+/* CPU device specific helpers */
+
+/**
+ * dev_pm_opp_of_cpumask_remove_table() - Removes OPP table for @cpumask
+ * @cpumask:	cpumask for which OPP table needs to be removed
+ *
+ * This removes the OPP tables for CPUs present in the @cpumask.
+ * This should be used only to remove static entries created from DT.
+ */
+void dev_pm_opp_of_cpumask_remove_table(const struct cpumask *cpumask)
+{
+	_dev_pm_opp_cpumask_remove_table(cpumask, true);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_remove_table);
+
+/**
+ * dev_pm_opp_of_cpumask_add_table() - Adds OPP table for @cpumask
+ * @cpumask:	cpumask for which OPP table needs to be added.
+ *
+ * This adds the OPP tables for CPUs present in the @cpumask.
+ */
+int dev_pm_opp_of_cpumask_add_table(const struct cpumask *cpumask)
+{
+	struct device *cpu_dev;
+	int cpu, ret = 0;
+
+	WARN_ON(cpumask_empty(cpumask));
+
+	for_each_cpu(cpu, cpumask) {
+		cpu_dev = get_cpu_device(cpu);
+		if (!cpu_dev) {
+			pr_err("%s: failed to get cpu%d device\n", __func__,
+			       cpu);
+			continue;
+		}
+
+		ret = dev_pm_opp_of_add_table(cpu_dev);
+		if (ret) {
+			/*
+			 * OPP may get registered dynamically, don't print error
+			 * message here.
+			 */
+			pr_debug("%s: couldn't find opp table for cpu:%d, %d\n",
+				 __func__, cpu, ret);
+
+			/* Free all other OPPs */
+			dev_pm_opp_of_cpumask_remove_table(cpumask);
+			break;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_cpumask_add_table);
+
+/*
+ * Works only for OPP v2 bindings.
+ *
+ * Returns -ENOENT if operating-points-v2 bindings aren't supported.
+ */
+/**
+ * dev_pm_opp_of_get_sharing_cpus() - Get cpumask of CPUs sharing OPPs with
+ *				      @cpu_dev using operating-points-v2
+ *				      bindings.
+ *
+ * @cpu_dev:	CPU device for which we do this operation
+ * @cpumask:	cpumask to update with information of sharing CPUs
+ *
+ * This updates the @cpumask with CPUs that are sharing OPPs with @cpu_dev.
+ *
+ * Returns -ENOENT if operating-points-v2 isn't present for @cpu_dev.
+ */
+int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
+				   struct cpumask *cpumask)
+{
+	struct device_node *np, *tmp_np, *cpu_np;
+	int cpu, ret = 0;
+
+	/* Get OPP descriptor node */
+	np = dev_pm_opp_of_get_opp_desc_node(cpu_dev);
+	if (!np) {
+		dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__);
+		return -ENOENT;
+	}
+
+	cpumask_set_cpu(cpu_dev->id, cpumask);
+
+	/* OPPs are shared ? */
+	if (!of_property_read_bool(np, "opp-shared"))
+		goto put_cpu_node;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu == cpu_dev->id)
+			continue;
+
+		cpu_np = of_get_cpu_node(cpu, NULL);
+		if (!cpu_np) {
+			dev_err(cpu_dev, "%s: failed to get cpu%d node\n",
+				__func__, cpu);
+			ret = -ENOENT;
+			goto put_cpu_node;
+		}
+
+		/* Get OPP descriptor node */
+		tmp_np = _opp_of_get_opp_desc_node(cpu_np);
+		if (!tmp_np) {
+			pr_err("%pOF: Couldn't find opp node\n", cpu_np);
+			ret = -ENOENT;
+			goto put_cpu_node;
+		}
+
+		/* CPUs are sharing opp node */
+		if (np == tmp_np)
+			cpumask_set_cpu(cpu, cpumask);
+
+		of_node_put(tmp_np);
+	}
+
+put_cpu_node:
+	of_node_put(np);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_of_get_sharing_cpus);
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
new file mode 100644
index 000000000000..166eef990599
--- /dev/null
+++ b/drivers/opp/opp.h
@@ -0,0 +1,222 @@
+/*
+ * Generic OPP Interface
+ *
+ * Copyright (C) 2009-2010 Texas Instruments Incorporated.
+ *	Nishanth Menon
+ *	Romit Dasgupta
+ *	Kevin Hilman
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __DRIVER_OPP_H__
+#define __DRIVER_OPP_H__
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/limits.h>
+#include <linux/pm_opp.h>
+#include <linux/notifier.h>
+
+struct clk;
+struct regulator;
+
+/* Lock to allow exclusive modification to the device and opp lists */
+extern struct mutex opp_table_lock;
+
+extern struct list_head opp_tables;
+
+/*
+ * Internal data structure organization with the OPP layer library is as
+ * follows:
+ * opp_tables (root)
+ *	|- device 1 (represents voltage domain 1)
+ *	|	|- opp 1 (availability, freq, voltage)
+ *	|	|- opp 2 ..
+ *	...	...
+ *	|	`- opp n ..
+ *	|- device 2 (represents the next voltage domain)
+ *	...
+ *	`- device m (represents mth voltage domain)
+ * device 1, 2.. are represented by opp_table structure while each opp
+ * is represented by the opp structure.
+ */
+
+/**
+ * struct dev_pm_opp - Generic OPP description structure
+ * @node:	opp table node. The nodes are maintained throughout the lifetime
+ *		of boot. It is expected only an optimal set of OPPs are
+ *		added to the library by the SoC framework.
+ *		IMPORTANT: the opp nodes should be maintained in increasing
+ *		order.
+ * @kref:	for reference count of the OPP.
+ * @available:	true/false - marks if this OPP as available or not
+ * @dynamic:	not-created from static DT entries.
+ * @turbo:	true if turbo (boost) OPP
+ * @suspend:	true if suspend OPP
+ * @rate:	Frequency in hertz
+ * @supplies:	Power supplies voltage/current values
+ * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
+ *		frequency from any other OPP's frequency.
+ * @opp_table:	points back to the opp_table struct this opp belongs to
+ * @np:		OPP's device node.
+ * @dentry:	debugfs dentry pointer (per opp)
+ *
+ * This structure stores the OPP information for a given device.
+ */
+struct dev_pm_opp {
+	struct list_head node;
+	struct kref kref;
+
+	bool available;
+	bool dynamic;
+	bool turbo;
+	bool suspend;
+	unsigned long rate;
+
+	struct dev_pm_opp_supply *supplies;
+
+	unsigned long clock_latency_ns;
+
+	struct opp_table *opp_table;
+
+	struct device_node *np;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+#endif
+};
+
+/**
+ * struct opp_device - devices managed by 'struct opp_table'
+ * @node:	list node
+ * @dev:	device to which the struct object belongs
+ * @dentry:	debugfs dentry pointer (per device)
+ *
+ * This is an internal data structure maintaining the devices that are managed
+ * by 'struct opp_table'.
+ */
+struct opp_device {
+	struct list_head node;
+	const struct device *dev;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+#endif
+};
+
+enum opp_table_access {
+	OPP_TABLE_ACCESS_UNKNOWN = 0,
+	OPP_TABLE_ACCESS_EXCLUSIVE = 1,
+	OPP_TABLE_ACCESS_SHARED = 2,
+};
+
+/**
+ * struct opp_table - Device opp structure
+ * @node:	table node - contains the devices with OPPs that
+ *		have been registered. Nodes once added are not modified in this
+ *		table.
+ * @head:	notifier head to notify the OPP availability changes.
+ * @dev_list:	list of devices that share these OPPs
+ * @opp_list:	table of opps
+ * @kref:	for reference count of the table.
+ * @lock:	mutex protecting the opp_list.
+ * @np:		struct device_node pointer for opp's DT node.
+ * @clock_latency_ns_max: Max clock latency in nanoseconds.
+ * @shared_opp: OPP is shared between multiple devices.
+ * @suspend_opp: Pointer to OPP to be used during device suspend.
+ * @supported_hw: Array of version number to support.
+ * @supported_hw_count: Number of elements in supported_hw array.
+ * @prop_name: A name to postfix to many DT properties, while parsing them.
+ * @clk: Device's clock handle
+ * @regulators: Supply regulators
+ * @regulator_count: Number of power supply regulators
+ * @set_opp: Platform specific set_opp callback
+ * @set_opp_data: Data to be passed to set_opp callback
+ * @dentry:	debugfs dentry pointer of the real device directory (not links).
+ * @dentry_name: Name of the real dentry.
+ *
+ * @voltage_tolerance_v1: In percentage, for v1 bindings only.
+ *
+ * This is an internal data structure maintaining the link to opps attached to
+ * a device. This structure is not meant to be shared to users as it is
+ * meant for book keeping and private to OPP library.
+ */
+struct opp_table {
+	struct list_head node;
+
+	struct blocking_notifier_head head;
+	struct list_head dev_list;
+	struct list_head opp_list;
+	struct kref kref;
+	struct mutex lock;
+
+	struct device_node *np;
+	unsigned long clock_latency_ns_max;
+
+	/* For backward compatibility with v1 bindings */
+	unsigned int voltage_tolerance_v1;
+
+	enum opp_table_access shared_opp;
+	struct dev_pm_opp *suspend_opp;
+
+	unsigned int *supported_hw;
+	unsigned int supported_hw_count;
+	const char *prop_name;
+	struct clk *clk;
+	struct regulator **regulators;
+	unsigned int regulator_count;
+
+	int (*set_opp)(struct dev_pm_set_opp_data *data);
+	struct dev_pm_set_opp_data *set_opp_data;
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *dentry;
+	char dentry_name[NAME_MAX];
+#endif
+};
+
+/* Routines internal to opp core */
+void _get_opp_table_kref(struct opp_table *opp_table);
+struct opp_table *_find_opp_table(struct device *dev);
+struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
+void _dev_pm_opp_remove_table(struct opp_table *opp_table, struct device *dev, bool remove_all);
+void _dev_pm_opp_find_and_remove_table(struct device *dev, bool remove_all);
+struct dev_pm_opp *_opp_allocate(struct opp_table *opp_table);
+void _opp_free(struct dev_pm_opp *opp);
+int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
+int _opp_add_v1(struct opp_table *opp_table, struct device *dev, unsigned long freq, long u_volt, bool dynamic);
+void _dev_pm_opp_cpumask_remove_table(const struct cpumask *cpumask, bool of);
+struct opp_table *_add_opp_table(struct device *dev);
+
+#ifdef CONFIG_OF
+void _of_init_opp_table(struct opp_table *opp_table, struct device *dev);
+#else
+static inline void _of_init_opp_table(struct opp_table *opp_table, struct device *dev) {}
+#endif
+
+#ifdef CONFIG_DEBUG_FS
+void opp_debug_remove_one(struct dev_pm_opp *opp);
+int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table);
+int opp_debug_register(struct opp_device *opp_dev, struct opp_table *opp_table);
+void opp_debug_unregister(struct opp_device *opp_dev, struct opp_table *opp_table);
+#else
+static inline void opp_debug_remove_one(struct dev_pm_opp *opp) {}
+
+static inline int opp_debug_create_one(struct dev_pm_opp *opp,
+				       struct opp_table *opp_table)
+{ return 0; }
+static inline int opp_debug_register(struct opp_device *opp_dev,
+				     struct opp_table *opp_table)
+{ return 0; }
+
+static inline void opp_debug_unregister(struct opp_device *opp_dev,
+					struct opp_table *opp_table)
+{ }
+#endif		/* DEBUG_FS */
+
+#endif		/* __DRIVER_OPP_H__ */
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e8517b63eb37..e880ca22c5a5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -259,20 +259,6 @@ config APM_EMULATION
 	  anything, try disabling/enabling this option (or disabling/enabling
 	  APM in your BIOS).
 
-config PM_OPP
-	bool
-	select SRCU
-	---help---
-	  SOCs have a standard set of tuples consisting of frequency and
-	  voltage pairs that the device will support per voltage domain. This
-	  is called Operating Performance Point or OPP. The actual definitions
-	  of OPP varies over silicon within the same family of devices.
-
-	  OPP layer organizes the data internally using device pointers
-	  representing individual voltage domains and provides SOC
-	  implementations a ready to use framework to manage OPPs.
-	  For more information, read <file:Documentation/power/opp.txt>
-
 config PM_CLK
 	def_bool y
 	depends on PM && HAVE_CLK
-- 
cgit v1.3-14-g43fede


From 64ec72a1ece37d9bc7ba8b11d6091ce7cb1d8eec Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 27 Sep 2017 22:01:34 -0700
Subject: PM: Use a more common logging style

Convert printks to pr_<level>.

Miscellanea:

o Use pr_fmt with "PM:" and remove "PM: " from format strings
o Coalesce format strings and realign format arguments
o Convert an embedded incorrect function name to "%s: ", __func__
o Convert a couple multi-line formats to multiple pr_<level> calls

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/qos.c      |   4 +-
 kernel/power/snapshot.c |  35 ++++++-------
 kernel/power/swap.c     | 128 +++++++++++++++++++++---------------------------
 3 files changed, 77 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..9d7503910ce2 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -701,8 +701,8 @@ static int __init pm_qos_power_init(void)
 	for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
 		ret = register_pm_qos_misc(pm_qos_array[i], d);
 		if (ret < 0) {
-			printk(KERN_ERR "pm_qos_param: %s setup failed\n",
-			       pm_qos_array[i]->name);
+			pr_err("%s: %s setup failed\n",
+			       __func__, pm_qos_array[i]->name);
 			return ret;
 		}
 	}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0972a8e09d08..a917a301e201 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,8 @@
  *
  */
 
+#define pr_fmt(fmt) "PM: " fmt
+
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -967,7 +969,7 @@ void __init __register_nosave_region(unsigned long start_pfn,
 	region->end_pfn = end_pfn;
 	list_add_tail(&region->list, &nosave_regions);
  Report:
-	printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+	pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n",
 		(unsigned long long) start_pfn << PAGE_SHIFT,
 		((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
 }
@@ -1039,7 +1041,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 	list_for_each_entry(region, &nosave_regions, list) {
 		unsigned long pfn;
 
-		pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+		pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n",
 			 (unsigned long long) region->start_pfn << PAGE_SHIFT,
 			 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
 				- 1);
@@ -1095,7 +1097,7 @@ int create_basic_memory_bitmaps(void)
 	free_pages_map = bm2;
 	mark_nosave_pages(forbidden_pages_map);
 
-	pr_debug("PM: Basic memory bitmaps created\n");
+	pr_debug("Basic memory bitmaps created\n");
 
 	return 0;
 
@@ -1131,7 +1133,7 @@ void free_basic_memory_bitmaps(void)
 	memory_bm_free(bm2, PG_UNSAFE_CLEAR);
 	kfree(bm2);
 
-	pr_debug("PM: Basic memory bitmaps freed\n");
+	pr_debug("Basic memory bitmaps freed\n");
 }
 
 void clear_free_pages(void)
@@ -1152,7 +1154,7 @@ void clear_free_pages(void)
 		pfn = memory_bm_next_pfn(bm);
 	}
 	memory_bm_position_reset(bm);
-	pr_info("PM: free pages cleared after restore\n");
+	pr_info("free pages cleared after restore\n");
 #endif /* PAGE_POISONING_ZERO */
 }
 
@@ -1690,7 +1692,7 @@ int hibernate_preallocate_memory(void)
 	ktime_t start, stop;
 	int error;
 
-	printk(KERN_INFO "PM: Preallocating image memory... ");
+	pr_info("Preallocating image memory... ");
 	start = ktime_get();
 
 	error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
@@ -1821,13 +1823,13 @@ int hibernate_preallocate_memory(void)
 
  out:
 	stop = ktime_get();
-	printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+	pr_cont("done (allocated %lu pages)\n", pages);
 	swsusp_show_speed(start, stop, pages, "Allocated");
 
 	return 0;
 
  err_out:
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 	swsusp_free();
 	return -ENOMEM;
 }
@@ -1867,8 +1869,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 			free += zone_page_state(zone, NR_FREE_PAGES);
 
 	nr_pages += count_pages_for_highmem(nr_highmem);
-	pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
-		nr_pages, PAGES_FOR_IO, free);
+	pr_debug("Normal pages needed: %u + %u, available pages: %u\n",
+		 nr_pages, PAGES_FOR_IO, free);
 
 	return free > nr_pages + PAGES_FOR_IO;
 }
@@ -1961,20 +1963,20 @@ asmlinkage __visible int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
 
-	printk(KERN_INFO "PM: Creating hibernation image:\n");
+	pr_info("Creating hibernation image:\n");
 
 	drain_local_pages(NULL);
 	nr_pages = count_data_pages();
 	nr_highmem = count_highmem_pages();
-	printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+	pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);
 
 	if (!enough_free_mem(nr_pages, nr_highmem)) {
-		printk(KERN_ERR "PM: Not enough free memory\n");
+		pr_err("Not enough free memory\n");
 		return -ENOMEM;
 	}
 
 	if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
-		printk(KERN_ERR "PM: Memory allocation failed\n");
+		pr_err("Memory allocation failed\n");
 		return -ENOMEM;
 	}
 
@@ -1995,8 +1997,7 @@ asmlinkage __visible int swsusp_save(void)
 	nr_copy_pages = nr_pages;
 	nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
 
-	printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
-		nr_pages);
+	pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
 
 	return 0;
 }
@@ -2170,7 +2171,7 @@ static int check_header(struct swsusp_info *info)
 	if (!reason && info->num_physpages != get_num_physpages())
 		reason = "memory size";
 	if (reason) {
-		printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+		pr_err("Image mismatch: %s\n", reason);
 		return -EPERM;
 	}
 	return 0;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7cdc426ee38..293ead59eccc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,6 +12,8 @@
  *
  */
 
+#define pr_fmt(fmt) "PM: " fmt
+
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/delay.h>
@@ -241,9 +243,9 @@ static void hib_end_io(struct bio *bio)
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
 	if (bio->bi_status) {
-		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
-				MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
-				(unsigned long long)bio->bi_iter.bi_sector);
+		pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
+			 MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+			 (unsigned long long)bio->bi_iter.bi_sector);
 	}
 
 	if (bio_data_dir(bio) == WRITE)
@@ -273,8 +275,8 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 	bio_set_op_attrs(bio, op, op_flags);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
-			(unsigned long long)bio->bi_iter.bi_sector);
+		pr_err("Adding page to bio failed at %llu\n",
+		       (unsigned long long)bio->bi_iter.bi_sector);
 		bio_put(bio);
 		return -EFAULT;
 	}
@@ -319,7 +321,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
 				      swsusp_resume_block, swsusp_header, NULL);
 	} else {
-		printk(KERN_ERR "PM: Swap header not found!\n");
+		pr_err("Swap header not found!\n");
 		error = -ENODEV;
 	}
 	return error;
@@ -413,8 +415,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
 	ret = swsusp_swap_check();
 	if (ret) {
 		if (ret != -ENOSPC)
-			printk(KERN_ERR "PM: Cannot find swap device, try "
-					"swapon -a.\n");
+			pr_err("Cannot find swap device, try swapon -a\n");
 		return ret;
 	}
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -491,9 +492,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
 {
 	if (!error) {
 		flush_swap_writer(handle);
-		printk(KERN_INFO "PM: S");
+		pr_info("S");
 		error = mark_swapfiles(handle, flags);
-		printk("|\n");
+		pr_cont("|\n");
 	}
 
 	if (error)
@@ -542,7 +543,7 @@ static int save_image(struct swap_map_handle *handle,
 
 	hib_init_batch(&hb);
 
-	printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
+	pr_info("Saving image data pages (%u pages)...\n",
 		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
@@ -557,8 +558,8 @@ static int save_image(struct swap_map_handle *handle,
 		if (ret)
 			break;
 		if (!(nr_pages % m))
-			printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
-			       nr_pages / m * 10);
+			pr_info("Image saving progress: %3d%%\n",
+				nr_pages / m * 10);
 		nr_pages++;
 	}
 	err2 = hib_wait_io(&hb);
@@ -566,7 +567,7 @@ static int save_image(struct swap_map_handle *handle,
 	if (!ret)
 		ret = err2;
 	if (!ret)
-		printk(KERN_INFO "PM: Image saving done.\n");
+		pr_info("Image saving done\n");
 	swsusp_show_speed(start, stop, nr_to_write, "Wrote");
 	return ret;
 }
@@ -692,14 +693,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
 
 	page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
 	if (!page) {
-		printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+		pr_err("Failed to allocate LZO page\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
 
 	data = vmalloc(sizeof(*data) * nr_threads);
 	if (!data) {
-		printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+		pr_err("Failed to allocate LZO data\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -708,7 +709,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 
 	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
 	if (!crc) {
-		printk(KERN_ERR "PM: Failed to allocate crc\n");
+		pr_err("Failed to allocate crc\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -726,8 +727,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 		                            "image_compress/%u", thr);
 		if (IS_ERR(data[thr].thr)) {
 			data[thr].thr = NULL;
-			printk(KERN_ERR
-			       "PM: Cannot start compression threads\n");
+			pr_err("Cannot start compression threads\n");
 			ret = -ENOMEM;
 			goto out_clean;
 		}
@@ -749,7 +749,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
 	if (IS_ERR(crc->thr)) {
 		crc->thr = NULL;
-		printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+		pr_err("Cannot start CRC32 thread\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -760,10 +760,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	 */
 	handle->reqd_free_pages = reqd_free_pages();
 
-	printk(KERN_INFO
-		"PM: Using %u thread(s) for compression.\n"
-		"PM: Compressing and saving image data (%u pages)...\n",
-		nr_threads, nr_to_write);
+	pr_info("Using %u thread(s) for compression\n", nr_threads);
+	pr_info("Compressing and saving image data (%u pages)...\n",
+		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
 		m = 1;
@@ -783,10 +782,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
 				       data_of(*snapshot), PAGE_SIZE);
 
 				if (!(nr_pages % m))
-					printk(KERN_INFO
-					       "PM: Image saving progress: "
-					       "%3d%%\n",
-				               nr_pages / m * 10);
+					pr_info("Image saving progress: %3d%%\n",
+						nr_pages / m * 10);
 				nr_pages++;
 			}
 			if (!off)
@@ -813,15 +810,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
 			ret = data[thr].ret;
 
 			if (ret < 0) {
-				printk(KERN_ERR "PM: LZO compression failed\n");
+				pr_err("LZO compression failed\n");
 				goto out_finish;
 			}
 
 			if (unlikely(!data[thr].cmp_len ||
 			             data[thr].cmp_len >
 			             lzo1x_worst_compress(data[thr].unc_len))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO compressed length\n");
+				pr_err("Invalid LZO compressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -857,7 +853,7 @@ out_finish:
 	if (!ret)
 		ret = err2;
 	if (!ret)
-		printk(KERN_INFO "PM: Image saving done.\n");
+		pr_info("Image saving done\n");
 	swsusp_show_speed(start, stop, nr_to_write, "Wrote");
 out_clean:
 	if (crc) {
@@ -888,7 +884,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
 	unsigned int free_swap = count_swap_pages(root_swap, 1);
 	unsigned int required;
 
-	pr_debug("PM: Free swap pages: %u\n", free_swap);
+	pr_debug("Free swap pages: %u\n", free_swap);
 
 	required = PAGES_FOR_IO + nr_pages;
 	return free_swap > required;
@@ -915,12 +911,12 @@ int swsusp_write(unsigned int flags)
 	pages = snapshot_get_image_size();
 	error = get_swap_writer(&handle);
 	if (error) {
-		printk(KERN_ERR "PM: Cannot get swap writer\n");
+		pr_err("Cannot get swap writer\n");
 		return error;
 	}
 	if (flags & SF_NOCOMPRESS_MODE) {
 		if (!enough_swap(pages, flags)) {
-			printk(KERN_ERR "PM: Not enough free swap\n");
+			pr_err("Not enough free swap\n");
 			error = -ENOSPC;
 			goto out_finish;
 		}
@@ -1068,8 +1064,7 @@ static int load_image(struct swap_map_handle *handle,
 	hib_init_batch(&hb);
 
 	clean_pages_on_read = true;
-	printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
-		nr_to_read);
+	pr_info("Loading image data pages (%u pages)...\n", nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
@@ -1087,8 +1082,8 @@ static int load_image(struct swap_map_handle *handle,
 		if (ret)
 			break;
 		if (!(nr_pages % m))
-			printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
-			       nr_pages / m * 10);
+			pr_info("Image loading progress: %3d%%\n",
+				nr_pages / m * 10);
 		nr_pages++;
 	}
 	err2 = hib_wait_io(&hb);
@@ -1096,7 +1091,7 @@ static int load_image(struct swap_map_handle *handle,
 	if (!ret)
 		ret = err2;
 	if (!ret) {
-		printk(KERN_INFO "PM: Image loading done.\n");
+		pr_info("Image loading done\n");
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			ret = -ENODATA;
@@ -1190,14 +1185,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
 	if (!page) {
-		printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+		pr_err("Failed to allocate LZO page\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
 
 	data = vmalloc(sizeof(*data) * nr_threads);
 	if (!data) {
-		printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+		pr_err("Failed to allocate LZO data\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1206,7 +1201,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
 	if (!crc) {
-		printk(KERN_ERR "PM: Failed to allocate crc\n");
+		pr_err("Failed to allocate crc\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1226,8 +1221,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		                            "image_decompress/%u", thr);
 		if (IS_ERR(data[thr].thr)) {
 			data[thr].thr = NULL;
-			printk(KERN_ERR
-			       "PM: Cannot start decompression threads\n");
+			pr_err("Cannot start decompression threads\n");
 			ret = -ENOMEM;
 			goto out_clean;
 		}
@@ -1249,7 +1243,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
 	if (IS_ERR(crc->thr)) {
 		crc->thr = NULL;
-		printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+		pr_err("Cannot start CRC32 thread\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1274,8 +1268,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		if (!page[i]) {
 			if (i < LZO_CMP_PAGES) {
 				ring_size = i;
-				printk(KERN_ERR
-				       "PM: Failed to allocate LZO pages\n");
+				pr_err("Failed to allocate LZO pages\n");
 				ret = -ENOMEM;
 				goto out_clean;
 			} else {
@@ -1285,10 +1278,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	}
 	want = ring_size = i;
 
-	printk(KERN_INFO
-		"PM: Using %u thread(s) for decompression.\n"
-		"PM: Loading and decompressing image data (%u pages)...\n",
-		nr_threads, nr_to_read);
+	pr_info("Using %u thread(s) for decompression\n", nr_threads);
+	pr_info("Loading and decompressing image data (%u pages)...\n",
+		nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
@@ -1348,8 +1340,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			if (unlikely(!data[thr].cmp_len ||
 			             data[thr].cmp_len >
 			             lzo1x_worst_compress(LZO_UNC_SIZE))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO compressed length\n");
+				pr_err("Invalid LZO compressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -1400,16 +1391,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			ret = data[thr].ret;
 
 			if (ret < 0) {
-				printk(KERN_ERR
-				       "PM: LZO decompression failed\n");
+				pr_err("LZO decompression failed\n");
 				goto out_finish;
 			}
 
 			if (unlikely(!data[thr].unc_len ||
 			             data[thr].unc_len > LZO_UNC_SIZE ||
 			             data[thr].unc_len & (PAGE_SIZE - 1))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO uncompressed length\n");
+				pr_err("Invalid LZO uncompressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -1420,10 +1409,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 				       data[thr].unc + off, PAGE_SIZE);
 
 				if (!(nr_pages % m))
-					printk(KERN_INFO
-					       "PM: Image loading progress: "
-					       "%3d%%\n",
-					       nr_pages / m * 10);
+					pr_info("Image loading progress: %3d%%\n",
+						nr_pages / m * 10);
 				nr_pages++;
 
 				ret = snapshot_write_next(snapshot);
@@ -1448,15 +1435,14 @@ out_finish:
 	}
 	stop = ktime_get();
 	if (!ret) {
-		printk(KERN_INFO "PM: Image loading done.\n");
+		pr_info("Image loading done\n");
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			ret = -ENODATA;
 		if (!ret) {
 			if (swsusp_header->flags & SF_CRC32_MODE) {
 				if(handle->crc32 != swsusp_header->crc32) {
-					printk(KERN_ERR
-					       "PM: Invalid image CRC32!\n");
+					pr_err("Invalid image CRC32!\n");
 					ret = -ENODATA;
 				}
 			}
@@ -1513,9 +1499,9 @@ int swsusp_read(unsigned int *flags_p)
 	swap_reader_finish(&handle);
 end:
 	if (!error)
-		pr_debug("PM: Image successfully loaded\n");
+		pr_debug("Image successfully loaded\n");
 	else
-		pr_debug("PM: Error %d resuming\n", error);
+		pr_debug("Error %d resuming\n", error);
 	return error;
 }
 
@@ -1552,13 +1538,13 @@ put:
 		if (error)
 			blkdev_put(hib_resume_bdev, FMODE_READ);
 		else
-			pr_debug("PM: Image signature found, resuming\n");
+			pr_debug("Image signature found, resuming\n");
 	} else {
 		error = PTR_ERR(hib_resume_bdev);
 	}
 
 	if (error)
-		pr_debug("PM: Image not found (code %d)\n", error);
+		pr_debug("Image not found (code %d)\n", error);
 
 	return error;
 }
@@ -1570,7 +1556,7 @@ put:
 void swsusp_close(fmode_t mode)
 {
 	if (IS_ERR(hib_resume_bdev)) {
-		pr_debug("PM: Image device not initialised\n");
+		pr_debug("Image device not initialised\n");
 		return;
 	}
 
@@ -1594,7 +1580,7 @@ int swsusp_unmark(void)
 					swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
-		printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+		pr_err("Cannot find swsusp signature!\n");
 		error = -ENODEV;
 	}
 
-- 
cgit v1.3-14-g43fede


From d8c4deee6dc6876ae3b7d09a5b58138a57ee45f6 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Fri, 8 Sep 2017 23:55:17 -0700
Subject: tracing: Remove obsolete sched_switch tracer selftest

Since commit 87d80de2800d087ea833cb79bc13f85ff34ed49f ("tracing: Remove
obsolete sched_switch tracer"), the sched_switch tracer selftest is no longer
used.  This patch removes the same.

Link: http://lkml.kernel.org/r/20170909065517.22262-1-joelaf@google.com

Cc: Ingo Molnar <mingo@redhat.com>
Cc: kernel-team@android.com
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.h          |  2 --
 kernel/trace/trace_selftest.c | 32 --------------------------------
 2 files changed, 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 652c682707cd..3c078e2ad777 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -738,8 +738,6 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace,
 					 struct trace_array *tr);
 extern int trace_selftest_startup_nop(struct tracer *trace,
 					 struct trace_array *tr);
-extern int trace_selftest_startup_sched_switch(struct tracer *trace,
-					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
 /*
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b17ec642793b..364f78abdf47 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1150,38 +1150,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_SCHED_TRACER */
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-int
-trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
-{
-	unsigned long count;
-	int ret;
-
-	/* start the tracing */
-	ret = tracer_init(trace, tr);
-	if (ret) {
-		warn_failed_init_tracer(trace, ret);
-		return ret;
-	}
-
-	/* Sleep for a 1/10 of a second */
-	msleep(100);
-	/* stop the tracing. */
-	tracing_stop();
-	/* check the trace buffer */
-	ret = trace_test_buffer(&tr->trace_buffer, &count);
-	trace->reset(tr);
-	tracing_start();
-
-	if (!ret && !count) {
-		printk(KERN_CONT ".. no entries found ..");
-		ret = -1;
-	}
-
-	return ret;
-}
-#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-
 #ifdef CONFIG_BRANCH_TRACER
 int
 trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
-- 
cgit v1.3-14-g43fede


From 6e7a2398114a2597a0995f96f44d908741ae5035 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 23 Aug 2017 12:23:09 +0100
Subject: tracing: Remove redundant unread variable ret

Integer ret is being assigned but never used and hence it is
redundant. Remove it, fixes clang warning:

trace_events_hist.c:1077:3: warning: Value stored to 'ret' is never read

Link: http://lkml.kernel.org/r/20170823112309.19383-1-colin.king@canonical.com

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1c21d0e2a145..f123b5d0c226 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -1062,7 +1062,7 @@ static void hist_trigger_show(struct seq_file *m,
 			      struct event_trigger_data *data, int n)
 {
 	struct hist_trigger_data *hist_data;
-	int n_entries, ret = 0;
+	int n_entries;
 
 	if (n > 0)
 		seq_puts(m, "\n\n");
@@ -1073,10 +1073,8 @@ static void hist_trigger_show(struct seq_file *m,
 
 	hist_data = data->private_data;
 	n_entries = print_entries(m, hist_data);
-	if (n_entries < 0) {
-		ret = n_entries;
+	if (n_entries < 0)
 		n_entries = 0;
-	}
 
 	seq_printf(m, "\nTotals:\n    Hits: %llu\n    Entries: %u\n    Dropped: %llu\n",
 		   (u64)atomic64_read(&hist_data->map->hits),
-- 
cgit v1.3-14-g43fede


From 12ecef0cb12102d8c034770173d2d1363cb97d52 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 21 Sep 2017 16:22:49 -0400
Subject: tracing: Reverse the order of trace_types_lock and event_mutex

In order to make future changes where we need to call
tracing_set_clock() from within an event command, the order of
trace_types_lock and event_mutex must be reversed, as the event command
will hold event_mutex and the trace_types_lock is taken from within
tracing_set_clock().

Link: http://lkml.kernel.org/r/20170921162249.0dde3dca@gandalf.local.home

Requested-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        |  5 +++++
 kernel/trace/trace_events.c | 31 +++++++++++++++----------------
 2 files changed, 20 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 752e5daf0896..5f1ac7d3402c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7687,6 +7687,7 @@ static int instance_mkdir(const char *name)
 	struct trace_array *tr;
 	int ret;
 
+	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);
 
 	ret = -EEXIST;
@@ -7742,6 +7743,7 @@ static int instance_mkdir(const char *name)
 	list_add(&tr->list, &ftrace_trace_arrays);
 
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return 0;
 
@@ -7753,6 +7755,7 @@ static int instance_mkdir(const char *name)
 
  out_unlock:
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 
@@ -7765,6 +7768,7 @@ static int instance_rmdir(const char *name)
 	int ret;
 	int i;
 
+	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);
 
 	ret = -ENODEV;
@@ -7810,6 +7814,7 @@ static int instance_rmdir(const char *name)
 
  out_unlock:
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87468398b9ed..ec0f9aa4e151 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 		return -ENODEV;
 
 	/* Make sure the system still exists */
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		list_for_each_entry(dir, &tr->systems, list) {
 			if (dir == inode->i_private) {
@@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 		}
 	}
  exit_loop:
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	if (!system)
 		return -ENODEV;
@@ -2294,15 +2294,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
 int trace_add_event_call(struct trace_event_call *call)
 {
 	int ret;
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 
 	ret = __register_event(call, NULL);
 	if (ret >= 0)
 		__add_event_to_tracers(call);
 
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 	return ret;
 }
 
@@ -2356,13 +2356,13 @@ int trace_remove_event_call(struct trace_event_call *call)
 {
 	int ret;
 
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	down_write(&trace_event_sem);
 	ret = probe_remove_event_call(call);
 	up_write(&trace_event_sem);
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 }
@@ -2424,8 +2424,8 @@ static int trace_module_notify(struct notifier_block *self,
 {
 	struct module *mod = data;
 
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	switch (val) {
 	case MODULE_STATE_COMING:
 		trace_module_add_events(mod);
@@ -2434,8 +2434,8 @@ static int trace_module_notify(struct notifier_block *self,
 		trace_module_remove_events(mod);
 		break;
 	}
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return 0;
 }
@@ -2950,24 +2950,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
  * creates the event hierachry in the @parent/events directory.
  *
  * Returns 0 on success.
+ *
+ * Must be called with event_mutex held.
  */
 int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 {
 	int ret;
 
-	mutex_lock(&event_mutex);
+	lockdep_assert_held(&event_mutex);
 
 	ret = create_event_toplevel_files(parent, tr);
 	if (ret)
-		goto out_unlock;
+		goto out;
 
 	down_write(&trace_event_sem);
 	__trace_add_event_dirs(tr);
 	up_write(&trace_event_sem);
 
- out_unlock:
-	mutex_unlock(&event_mutex);
-
+ out:
 	return ret;
 }
 
@@ -2996,9 +2996,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
 	return ret;
 }
 
+/* Must be called with event_mutex held */
 int event_trace_del_tracer(struct trace_array *tr)
 {
-	mutex_lock(&event_mutex);
+	lockdep_assert_held(&event_mutex);
 
 	/* Disable any event triggers and associated soft-disabled events */
 	clear_event_triggers(tr);
@@ -3019,8 +3020,6 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	tr->event_dir = NULL;
 
-	mutex_unlock(&event_mutex);
-
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 1a149d7d3f45d311da1f63473736c05f30ae8a75 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 22 Sep 2017 16:59:02 -0400
Subject: ring-buffer: Rewrite trace_recursive_(un)lock() to be simpler

The current method to prevent the ring buffer from entering into a recursize
loop is to use a bitmask and set the bit that maps to the current context
(normal, softirq, irq or NMI), and if that bit was already set, it is
considered a recursive loop.

New code is being added that may require the ring buffer to be entered a
second time in the current context. The recursive locking prevents that from
happening. Instead of mapping a bitmask to the current context, just allow 4
levels of nesting in the ring buffer. This matches the 4 context levels that
it can already nest. It is highly unlikely to have more than two levels,
thus it should be fine when we add the second entry into the ring buffer. If
that proves to be a problem, we can always up the number to 8.

An added benefit is that reading preempt_count() to get the current level
adds a very slight but noticeable overhead. This removes that need.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 64 ++++++++++++----------------------------------
 1 file changed, 17 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 81279c6602ff..f6ee9b1ef62a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2538,61 +2538,29 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * The lock and unlock are done within a preempt disable section.
  * The current_context per_cpu variable can only be modified
  * by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. To pass this
- * information from the lock to the unlock without having to
- * access the 'in_interrupt()' functions again (which do show
- * a bit of overhead in something as critical as function tracing,
- * we use a bitmask trick.
+ * be modified more than once via an interrupt. There are four
+ * different contexts that we need to consider.
  *
- *  bit 0 =  NMI context
- *  bit 1 =  IRQ context
- *  bit 2 =  SoftIRQ context
- *  bit 3 =  normal context.
+ *  Normal context.
+ *  SoftIRQ context
+ *  IRQ context
+ *  NMI context
  *
- * This works because this is the order of contexts that can
- * preempt other contexts. A SoftIRQ never preempts an IRQ
- * context.
- *
- * When the context is determined, the corresponding bit is
- * checked and set (if it was set, then a recursion of that context
- * happened).
- *
- * On unlock, we need to clear this bit. To do so, just subtract
- * 1 from the current_context and AND it to itself.
- *
- * (binary)
- *  101 - 1 = 100
- *  101 & 100 = 100 (clearing bit zero)
- *
- *  1010 - 1 = 1001
- *  1010 & 1001 = 1000 (clearing bit 1)
- *
- * The least significant bit can be cleared this way, and it
- * just so happens that it is the same bit corresponding to
- * the current context.
+ * If for some reason the ring buffer starts to recurse, we
+ * only allow that to happen at most 4 times (one for each
+ * context). If it happens 5 times, then we consider this a
+ * recusive loop and do not let it go further.
  */
 
 static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	unsigned int val = cpu_buffer->current_context;
-	int bit;
-
-	if (in_interrupt()) {
-		if (in_nmi())
-			bit = RB_CTX_NMI;
-		else if (in_irq())
-			bit = RB_CTX_IRQ;
-		else
-			bit = RB_CTX_SOFTIRQ;
-	} else
-		bit = RB_CTX_NORMAL;
-
-	if (unlikely(val & (1 << bit)))
+	if (cpu_buffer->current_context >= 4)
 		return 1;
 
-	val |= (1 << bit);
-	cpu_buffer->current_context = val;
+	cpu_buffer->current_context++;
+	/* Interrupts must see this update */
+	barrier();
 
 	return 0;
 }
@@ -2600,7 +2568,9 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 static __always_inline void
 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+	/* Don't let the dec leak out */
+	barrier();
+	cpu_buffer->current_context--;
 }
 
 /**
-- 
cgit v1.3-14-g43fede


From a15f7fc20389a8827d5859907568b201234d4b79 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:17 -0500
Subject: tracing: Exclude 'generic fields' from histograms

There are a small number of 'generic fields' (comm/COMM/cpu/CPU) that
are found by trace_find_event_field() but are only meant for
filtering.  Specifically, they unlike normal fields, they have a size
of 0 and thus wreak havoc when used as a histogram key.

Exclude these (return -EINVAL) when used as histogram keys.

Link: http://lkml.kernel.org/r/956154cbc3e8a4f0633d619b886c97f0f0edf7b4.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f123b5d0c226..121d56850f24 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -450,7 +450,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
 	}
 
 	field = trace_find_event_field(file->event_call, field_name);
-	if (!field) {
+	if (!field || !field->size) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -548,7 +548,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 		}
 
 		field = trace_find_event_field(file->event_call, field_name);
-		if (!field) {
+		if (!field || !field->size) {
 			ret = -EINVAL;
 			goto out;
 		}
-- 
cgit v1.3-14-g43fede


From 83c07ecc4203728e85fc4a2ce6fdf25d16ea118e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:18 -0500
Subject: tracing: Remove lookups from tracing_map hitcount

Lookups inflate the hitcount, making it essentially useless.  Only
inserts and updates should really affect the hitcount anyway, so
explicitly filter lookups out.

Link: http://lkml.kernel.org/r/c8d9dc39d269a8abf88bf4102d0dfc65deb0fc7f.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 305039b122fa..07e75344725b 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -428,7 +428,8 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
 
 		if (test_key && test_key == key_hash && entry->val &&
 		    keys_match(key, entry->val->key, map->key_size)) {
-			atomic64_inc(&map->hits);
+			if (!lookup_only)
+				atomic64_inc(&map->hits);
 			return entry->val;
 		}
 
-- 
cgit v1.3-14-g43fede


From 4f36c2d85cedea60ad424d44534121ab0458069e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:19 -0500
Subject: tracing: Increase tracing map KEYS_MAX size

The current default for the number of subkeys in a compound key is 2,
which is too restrictive.  Increase it to a more realistic value of 3.

Link: http://lkml.kernel.org/r/b6952cca06d1f912eba33804a6fd6069b3847d44.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/tracing_map.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 618838f5f30a..f0975110b967 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -5,7 +5,7 @@
 #define TRACING_MAP_BITS_MAX		17
 #define TRACING_MAP_BITS_MIN		7
 
-#define TRACING_MAP_KEYS_MAX		2
+#define TRACING_MAP_KEYS_MAX		3
 #define TRACING_MAP_VALS_MAX		3
 #define TRACING_MAP_FIELDS_MAX		(TRACING_MAP_KEYS_MAX + \
 					 TRACING_MAP_VALS_MAX)
-- 
cgit v1.3-14-g43fede


From 7e465baa80293ed5f87fdf6405391d6f02110d4e Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:20 -0500
Subject: tracing: Make traceprobe parsing code reusable

traceprobe_probes_write() and traceprobe_command() actually contain
nothing that ties them to kprobes - the code is generically useful for
similar types of parsing elsewhere, so separate it out and move it to
trace.c/trace.h.

Other than moving it, the only change is in naming:
traceprobe_probes_write() becomes trace_parse_run_command() and
traceprobe_command() becomes trace_run_command().

Link: http://lkml.kernel.org/r/ae5c26ea40c196a8986854d921eb6e713ede7e3f.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 86 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h        |  7 ++++
 kernel/trace/trace_kprobe.c | 18 +++++-----
 kernel/trace/trace_probe.c  | 86 ---------------------------------------------
 kernel/trace/trace_probe.h  |  7 ----
 kernel/trace/trace_uprobe.c |  2 +-
 6 files changed, 103 insertions(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5f1ac7d3402c..73e67b68c53b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -8281,6 +8281,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 }
 EXPORT_SYMBOL_GPL(ftrace_dump);
 
+int trace_run_command(const char *buf, int (*createfn)(int, char **))
+{
+	char **argv;
+	int argc, ret;
+
+	argc = 0;
+	ret = 0;
+	argv = argv_split(GFP_KERNEL, buf, &argc);
+	if (!argv)
+		return -ENOMEM;
+
+	if (argc)
+		ret = createfn(argc, argv);
+
+	argv_free(argv);
+
+	return ret;
+}
+
+#define WRITE_BUFSIZE  4096
+
+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
+				size_t count, loff_t *ppos,
+				int (*createfn)(int, char **))
+{
+	char *kbuf, *buf, *tmp;
+	int ret = 0;
+	size_t done = 0;
+	size_t size;
+
+	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	while (done < count) {
+		size = count - done;
+
+		if (size >= WRITE_BUFSIZE)
+			size = WRITE_BUFSIZE - 1;
+
+		if (copy_from_user(kbuf, buffer + done, size)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		kbuf[size] = '\0';
+		buf = kbuf;
+		do {
+			tmp = strchr(buf, '\n');
+			if (tmp) {
+				*tmp = '\0';
+				size = tmp - buf + 1;
+			} else {
+				size = strlen(buf);
+				if (done + size < count) {
+					if (buf != kbuf)
+						break;
+					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+					pr_warn("Line length is too long: Should be less than %d\n",
+						WRITE_BUFSIZE - 2);
+					ret = -EINVAL;
+					goto out;
+				}
+			}
+			done += size;
+
+			/* Remove comments */
+			tmp = strchr(buf, '#');
+
+			if (tmp)
+				*tmp = '\0';
+
+			ret = trace_run_command(buf, createfn);
+			if (ret)
+				goto out;
+			buf += size;
+
+		} while (done < count);
+	}
+	ret = done;
+
+out:
+	kfree(kbuf);
+
+	return ret;
+}
+
 __init static int tracer_alloc_buffers(void)
 {
 	int ring_buf_size;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3c078e2ad777..f8343eb3c1f9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1752,6 +1752,13 @@ void trace_printk_start_comm(void);
 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 
+#define MAX_EVENT_NAME_LEN	64
+
+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
+extern ssize_t trace_parse_run_command(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos,
+		int (*createfn)(int, char**));
+
 /*
  * Normal trace_printk() and friends allocates special buffers
  * to do the manipulation, as well as saves the print formats
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e12b6b9..af6134f2e597 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -907,8 +907,8 @@ static int probes_open(struct inode *inode, struct file *file)
 static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
 {
-	return traceprobe_probes_write(file, buffer, count, ppos,
-			create_trace_kprobe);
+	return trace_parse_run_command(file, buffer, count, ppos,
+				       create_trace_kprobe);
 }
 
 static const struct file_operations kprobe_events_ops = {
@@ -1433,9 +1433,9 @@ static __init int kprobe_trace_self_tests_init(void)
 
 	pr_info("Testing kprobe tracing: ");
 
-	ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
-				  "$stack $stack0 +0($stack)",
-				  create_trace_kprobe);
+	ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
+				"$stack $stack0 +0($stack)",
+				create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function entry.\n");
 		warn++;
@@ -1455,8 +1455,8 @@ static __init int kprobe_trace_self_tests_init(void)
 		}
 	}
 
-	ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
-				  "$retval", create_trace_kprobe);
+	ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
+				"$retval", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function return.\n");
 		warn++;
@@ -1526,13 +1526,13 @@ static __init int kprobe_trace_self_tests_init(void)
 			disable_trace_kprobe(tk, file);
 	}
 
-	ret = traceprobe_command("-:testprobe", create_trace_kprobe);
+	ret = trace_run_command("-:testprobe", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
 	}
 
-	ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
+	ret = trace_run_command("-:testprobe2", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 52478f033f88..d59357308677 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -623,92 +623,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
 	kfree(arg->comm);
 }
 
-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
-{
-	char **argv;
-	int argc, ret;
-
-	argc = 0;
-	ret = 0;
-	argv = argv_split(GFP_KERNEL, buf, &argc);
-	if (!argv)
-		return -ENOMEM;
-
-	if (argc)
-		ret = createfn(argc, argv);
-
-	argv_free(argv);
-
-	return ret;
-}
-
-#define WRITE_BUFSIZE  4096
-
-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
-				size_t count, loff_t *ppos,
-				int (*createfn)(int, char **))
-{
-	char *kbuf, *buf, *tmp;
-	int ret = 0;
-	size_t done = 0;
-	size_t size;
-
-	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
-	if (!kbuf)
-		return -ENOMEM;
-
-	while (done < count) {
-		size = count - done;
-
-		if (size >= WRITE_BUFSIZE)
-			size = WRITE_BUFSIZE - 1;
-
-		if (copy_from_user(kbuf, buffer + done, size)) {
-			ret = -EFAULT;
-			goto out;
-		}
-		kbuf[size] = '\0';
-		buf = kbuf;
-		do {
-			tmp = strchr(buf, '\n');
-			if (tmp) {
-				*tmp = '\0';
-				size = tmp - buf + 1;
-			} else {
-				size = strlen(buf);
-				if (done + size < count) {
-					if (buf != kbuf)
-						break;
-					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
-					pr_warn("Line length is too long: Should be less than %d\n",
-						WRITE_BUFSIZE - 2);
-					ret = -EINVAL;
-					goto out;
-				}
-			}
-			done += size;
-
-			/* Remove comments */
-			tmp = strchr(buf, '#');
-
-			if (tmp)
-				*tmp = '\0';
-
-			ret = traceprobe_command(buf, createfn);
-			if (ret)
-				goto out;
-			buf += size;
-
-		} while (done < count);
-	}
-	ret = done;
-
-out:
-	kfree(kbuf);
-
-	return ret;
-}
-
 static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
 			   bool is_return)
 {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..fb66e3eaa192 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -42,7 +42,6 @@
 
 #define MAX_TRACE_ARGS		128
 #define MAX_ARGSTR_LEN		63
-#define MAX_EVENT_NAME_LEN	64
 #define MAX_STRING_SIZE		PATH_MAX
 
 /* Reserved field names */
@@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
 
 extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
 
-extern ssize_t traceprobe_probes_write(struct file *file,
-		const char __user *buffer, size_t count, loff_t *ppos,
-		int (*createfn)(int, char**));
-
-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
-
 /* Sum up total data length for dynamic arraies (strings) */
 static nokprobe_inline int
 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4525e0271a53..b34965e62fb9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -651,7 +651,7 @@ static int probes_open(struct inode *inode, struct file *file)
 static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
 {
-	return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+	return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
 }
 
 static const struct file_operations uprobe_events_ops = {
-- 
cgit v1.3-14-g43fede


From 0d7a8325bf3326c92da2d21b4496a9ddde896d4f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:21 -0500
Subject: tracing: Clean up hist_field_flags enum

As we add more flags, specifying explicit integers for the flag values
becomes more unwieldy and error-prone - switch them over to left-shift
values.

Link: http://lkml.kernel.org/r/e644e4fb7665aec015f4a2d84a2f990d3dd5b8a1.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 121d56850f24..0c7ec3048624 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -110,16 +110,16 @@ DEFINE_HIST_FIELD_FN(u8);
 #define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
 
 enum hist_field_flags {
-	HIST_FIELD_FL_HITCOUNT		= 1,
-	HIST_FIELD_FL_KEY		= 2,
-	HIST_FIELD_FL_STRING		= 4,
-	HIST_FIELD_FL_HEX		= 8,
-	HIST_FIELD_FL_SYM		= 16,
-	HIST_FIELD_FL_SYM_OFFSET	= 32,
-	HIST_FIELD_FL_EXECNAME		= 64,
-	HIST_FIELD_FL_SYSCALL		= 128,
-	HIST_FIELD_FL_STACKTRACE	= 256,
-	HIST_FIELD_FL_LOG2		= 512,
+	HIST_FIELD_FL_HITCOUNT		= 1 << 0,
+	HIST_FIELD_FL_KEY		= 1 << 1,
+	HIST_FIELD_FL_STRING		= 1 << 2,
+	HIST_FIELD_FL_HEX		= 1 << 3,
+	HIST_FIELD_FL_SYM		= 1 << 4,
+	HIST_FIELD_FL_SYM_OFFSET	= 1 << 5,
+	HIST_FIELD_FL_EXECNAME		= 1 << 6,
+	HIST_FIELD_FL_SYSCALL		= 1 << 7,
+	HIST_FIELD_FL_STACKTRACE	= 1 << 8,
+	HIST_FIELD_FL_LOG2		= 1 << 9,
 };
 
 struct hist_trigger_attrs {
-- 
cgit v1.3-14-g43fede


From 85013256cf01629f72a327674c5d007b4a4b40da Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:22 -0500
Subject: tracing: Add hist_field_name() accessor

In preparation for hist_fields that won't be strictly based on
trace_event_fields, add a new hist_field_name() accessor to allow that
flexibility and update associated users.

Link: http://lkml.kernel.org/r/5b5a2d36dde067cbbe2434b10f06daac27b7dbd5.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 67 +++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0c7ec3048624..34edf5fd85fd 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -146,6 +146,23 @@ struct hist_trigger_data {
 	struct tracing_map		*map;
 };
 
+static const char *hist_field_name(struct hist_field *field,
+				   unsigned int level)
+{
+	const char *field_name = "";
+
+	if (level > 1)
+		return field_name;
+
+	if (field->field)
+		field_name = field->field->name;
+
+	if (field_name == NULL)
+		field_name = "";
+
+	return field_name;
+}
+
 static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
 {
 	hist_field_fn_t fn = NULL;
@@ -653,7 +670,6 @@ static int is_descending(const char *str)
 static int create_sort_keys(struct hist_trigger_data *hist_data)
 {
 	char *fields_str = hist_data->attrs->sort_key_str;
-	struct ftrace_event_field *field = NULL;
 	struct tracing_map_sort_key *sort_key;
 	int descending, ret = 0;
 	unsigned int i, j;
@@ -670,7 +686,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 	}
 
 	for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+		struct hist_field *hist_field;
 		char *field_str, *field_name;
+		const char *test_name;
 
 		sort_key = &hist_data->sort_keys[i];
 
@@ -703,8 +721,10 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 		}
 
 		for (j = 1; j < hist_data->n_fields; j++) {
-			field = hist_data->fields[j]->field;
-			if (field && (strcmp(field_name, field->name) == 0)) {
+			hist_field = hist_data->fields[j];
+			test_name = hist_field_name(hist_field, 0);
+
+			if (strcmp(field_name, test_name) == 0) {
 				sort_key->field_idx = j;
 				descending = is_descending(field_str);
 				if (descending < 0) {
@@ -952,6 +972,7 @@ hist_trigger_entry_print(struct seq_file *m,
 	struct hist_field *key_field;
 	char str[KSYM_SYMBOL_LEN];
 	bool multiline = false;
+	const char *field_name;
 	unsigned int i;
 	u64 uval;
 
@@ -963,26 +984,27 @@ hist_trigger_entry_print(struct seq_file *m,
 		if (i > hist_data->n_vals)
 			seq_puts(m, ", ");
 
+		field_name = hist_field_name(key_field, 0);
+
 		if (key_field->flags & HIST_FIELD_FL_HEX) {
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %llx",
-				   key_field->field->name, uval);
+			seq_printf(m, "%s: %llx", field_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_SYM) {
 			uval = *(u64 *)(key + key_field->offset);
 			sprint_symbol_no_offset(str, uval);
-			seq_printf(m, "%s: [%llx] %-45s",
-				   key_field->field->name, uval, str);
+			seq_printf(m, "%s: [%llx] %-45s", field_name,
+				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
 			uval = *(u64 *)(key + key_field->offset);
 			sprint_symbol(str, uval);
-			seq_printf(m, "%s: [%llx] %-55s",
-				   key_field->field->name, uval, str);
+			seq_printf(m, "%s: [%llx] %-55s", field_name,
+				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
 			char *comm = elt->private_data;
 
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %-16s[%10llu]",
-				   key_field->field->name, comm, uval);
+			seq_printf(m, "%s: %-16s[%10llu]", field_name,
+				   comm, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
 			const char *syscall_name;
 
@@ -991,8 +1013,8 @@ hist_trigger_entry_print(struct seq_file *m,
 			if (!syscall_name)
 				syscall_name = "unknown_syscall";
 
-			seq_printf(m, "%s: %-30s[%3llu]",
-				   key_field->field->name, syscall_name, uval);
+			seq_printf(m, "%s: %-30s[%3llu]", field_name,
+				   syscall_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
 			seq_puts(m, "stacktrace:\n");
 			hist_trigger_stacktrace_print(m,
@@ -1000,15 +1022,14 @@ hist_trigger_entry_print(struct seq_file *m,
 						      HIST_STACKTRACE_DEPTH);
 			multiline = true;
 		} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
-			seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+			seq_printf(m, "%s: ~ 2^%-2llu", field_name,
 				   *(u64 *)(key + key_field->offset));
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
-			seq_printf(m, "%s: %-50s", key_field->field->name,
+			seq_printf(m, "%s: %-50s", field_name,
 				   (char *)(key + key_field->offset));
 		} else {
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %10llu", key_field->field->name,
-				   uval);
+			seq_printf(m, "%s: %10llu", field_name, uval);
 		}
 	}
 
@@ -1021,13 +1042,13 @@ hist_trigger_entry_print(struct seq_file *m,
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
 
 	for (i = 1; i < hist_data->n_vals; i++) {
+		field_name = hist_field_name(hist_data->fields[i], 0);
+
 		if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
-			seq_printf(m, "  %s: %10llx",
-				   hist_data->fields[i]->field->name,
+			seq_printf(m, "  %s: %10llx", field_name,
 				   tracing_map_read_sum(elt, i));
 		} else {
-			seq_printf(m, "  %s: %10llu",
-				   hist_data->fields[i]->field->name,
+			seq_printf(m, "  %s: %10llu", field_name,
 				   tracing_map_read_sum(elt, i));
 		}
 	}
@@ -1140,7 +1161,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 
 static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 {
-	seq_printf(m, "%s", hist_field->field->name);
+	const char *field_name = hist_field_name(hist_field, 0);
+
+	seq_printf(m, "%s", field_name);
 	if (hist_field->flags) {
 		const char *flags_str = get_hist_field_flags(hist_field);
 
-- 
cgit v1.3-14-g43fede


From 5819eaddf35b24d628ddfa4fbb5f8d4026e44b96 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tom.zanussi@linux.intel.com>
Date: Fri, 22 Sep 2017 14:58:23 -0500
Subject: tracing: Reimplement log2

log2 as currently implemented applies only to u64 trace_event_field
derived fields, and assumes that anything it's applied to is a u64
field.

To prepare for synthetic fields like latencies, log2 should be
applicable to those as well, so take the opportunity now to fix the
current problems as well as expand to more general uses.

log2 should be thought of as a chaining function rather than a field
type.  To enable this as well as possible future function
implementations, add a hist_field operand array into the hist_field
definition for this purpose, and make use of it to implement the log2
'function'.

Link: http://lkml.kernel.org/r/b47f93fc0b87b36eccf716b0c018f3a71e1f1111.1506105045.git.tom.zanussi@linux.intel.com

Signed-off-by: Tom Zanussi <tom.zanussi@linux.intel.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_events_hist.c | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 34edf5fd85fd..1e1558c99d56 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -28,12 +28,16 @@ struct hist_field;
 
 typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
 
+#define HIST_FIELD_OPERANDS_MAX	2
+
 struct hist_field {
 	struct ftrace_event_field	*field;
 	unsigned long			flags;
 	hist_field_fn_t			fn;
 	unsigned int			size;
 	unsigned int			offset;
+	unsigned int                    is_signed;
+	struct hist_field		*operands[HIST_FIELD_OPERANDS_MAX];
 };
 
 static u64 hist_field_none(struct hist_field *field, void *event)
@@ -71,7 +75,9 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
 
 static u64 hist_field_log2(struct hist_field *hist_field, void *event)
 {
-	u64 val = *(u64 *)(event + hist_field->field->offset);
+	struct hist_field *operand = hist_field->operands[0];
+
+	u64 val = operand->fn(operand, event);
 
 	return (u64) ilog2(roundup_pow_of_two(val));
 }
@@ -156,6 +162,8 @@ static const char *hist_field_name(struct hist_field *field,
 
 	if (field->field)
 		field_name = field->field->name;
+	else if (field->flags & HIST_FIELD_FL_LOG2)
+		field_name = hist_field_name(field->operands[0], ++level);
 
 	if (field_name == NULL)
 		field_name = "";
@@ -357,8 +365,20 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
 	.elt_init	= hist_trigger_elt_comm_init,
 };
 
-static void destroy_hist_field(struct hist_field *hist_field)
+static void destroy_hist_field(struct hist_field *hist_field,
+			       unsigned int level)
 {
+	unsigned int i;
+
+	if (level > 2)
+		return;
+
+	if (!hist_field)
+		return;
+
+	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
+		destroy_hist_field(hist_field->operands[i], level + 1);
+
 	kfree(hist_field);
 }
 
@@ -385,7 +405,10 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 	}
 
 	if (flags & HIST_FIELD_FL_LOG2) {
+		unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
 		hist_field->fn = hist_field_log2;
+		hist_field->operands[0] = create_hist_field(field, fl);
+		hist_field->size = hist_field->operands[0]->size;
 		goto out;
 	}
 
@@ -405,7 +428,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		hist_field->fn = select_value_fn(field->size,
 						 field->is_signed);
 		if (!hist_field->fn) {
-			destroy_hist_field(hist_field);
+			destroy_hist_field(hist_field, 0);
 			return NULL;
 		}
 	}
@@ -422,7 +445,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
 
 	for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
 		if (hist_data->fields[i]) {
-			destroy_hist_field(hist_data->fields[i]);
+			destroy_hist_field(hist_data->fields[i], 0);
 			hist_data->fields[i] = NULL;
 		}
 	}
-- 
cgit v1.3-14-g43fede


From b35bd0d9f8a8ea17aae40893e18274d191a2d2c5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 30 Sep 2017 23:39:05 -0600
Subject: sysctl: remove /proc/sys/vm/nr_pdflush_threads

This tunable has been obsolete since 2.6.32, and writes to the
file have been failing and complaining in dmesg since then:

nr_pdflush_threads exported in /proc is scheduled for removal

That was 8 years ago. Remove the file ABI obsolete notice, and
the sysfs file.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads | 5 -----
 kernel/sysctl.c                                           | 5 -----
 2 files changed, 10 deletions(-)
 delete mode 100644 Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads

(limited to 'kernel')

diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
deleted file mode 100644
index b0b0eeb20fe3..000000000000
--- a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads
+++ /dev/null
@@ -1,5 +0,0 @@
-What:		/proc/sys/vm/nr_pdflush_threads
-Date:		June 2012
-Contact:	Wanpeng Li <liwp@linux.vnet.ibm.com>
-Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush
-             exported in /proc/sys/vm/ should be removed.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..a5dd8d82c253 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1344,11 +1344,6 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= dirtytime_interval_handler,
 		.extra1		= &zero,
 	},
-	{
-		.procname       = "nr_pdflush_threads",
-		.mode           = 0444 /* read-only */,
-		.proc_handler   = pdflush_proc_obsolete,
-	},
 	{
 		.procname	= "swappiness",
 		.data		= &vm_swappiness,
-- 
cgit v1.3-14-g43fede


From 6cafbe159416822f6d3dfd711bf4c39050c650ba Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 20 Jun 2017 10:44:58 -0400
Subject: ftrace: Add a ftrace_free_mem() function for modules to use

In order to be able to trace module init functions, the module code needs to
tell ftrace what is being freed when the init sections are freed. Use the
code that the main init calls to tell ftrace to free the main init sections.
This requires passing in a start and end address to free.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  2 ++
 kernel/trace/ftrace.c  | 14 +++++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 2e028854bac7..47fc404ad233 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -151,8 +151,10 @@ struct ftrace_ops_hash {
 };
 
 void ftrace_free_init_mem(void);
+void ftrace_free_mem(void *start, void *end);
 #else
 static inline void ftrace_free_init_mem(void) { }
+static inline void ftrace_free_mem(void *start, void *end) { }
 #endif
 
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6abfafd7f173..84cb5928665a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5868,10 +5868,10 @@ void ftrace_module_init(struct module *mod)
 }
 #endif /* CONFIG_MODULES */
 
-void __init ftrace_free_init_mem(void)
+void ftrace_free_mem(void *start_ptr, void *end_ptr)
 {
-	unsigned long start = (unsigned long)(&__init_begin);
-	unsigned long end = (unsigned long)(&__init_end);
+	unsigned long start = (unsigned long)(start_ptr);
+	unsigned long end = (unsigned long)(end_ptr);
 	struct ftrace_page **last_pg = &ftrace_pages_start;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
@@ -5913,6 +5913,14 @@ void __init ftrace_free_init_mem(void)
 	mutex_unlock(&ftrace_lock);
 }
 
+void __init ftrace_free_init_mem(void)
+{
+	void *start = (void *)(&__init_begin);
+	void *end = (void *)(&__init_end);
+
+	ftrace_free_mem(start, end);
+}
+
 void __init ftrace_init(void)
 {
 	extern unsigned long __start_mcount_loc[];
-- 
cgit v1.3-14-g43fede


From 324bda9e6c5add86ba2e1066476481c48132aca0 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:21 -0700
Subject: bpf: multi program support for cgroup+bpf

introduce BPF_F_ALLOW_MULTI flag that can be used to attach multiple
bpf programs to a cgroup.

The difference between three possible flags for BPF_PROG_ATTACH command:
- NONE(default): No further bpf programs allowed in the subtree.
- BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
  the program in this cgroup yields to sub-cgroup program.
- BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
  that cgroup program gets run in addition to the program in this cgroup.

NONE and BPF_F_ALLOW_OVERRIDE existed before. This patch doesn't
change their behavior. It only clarifies the semantics in relation
to new flag.

Only one program is allowed to be attached to a cgroup with
NONE or BPF_F_ALLOW_OVERRIDE flag.
Multiple programs are allowed to be attached to a cgroup with
BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
(those that were attached first, run first)
The programs of sub-cgroup are executed first, then programs of
this cgroup and then programs of parent cgroup.
All eligible programs are executed regardless of return code from
earlier programs.

To allow efficient execution of multiple programs attached to a cgroup
and to avoid penalizing cgroups without any programs attached
introduce 'struct bpf_prog_array' which is RCU protected array
of pointers to bpf programs.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h |  46 +++--
 include/linux/bpf.h        |  32 ++++
 include/linux/filter.h     |   2 +-
 include/uapi/linux/bpf.h   |  42 +++-
 kernel/bpf/cgroup.c        | 467 ++++++++++++++++++++++++++++++++-------------
 kernel/bpf/core.c          |  31 +++
 kernel/bpf/syscall.c       |  37 ++--
 kernel/cgroup/cgroup.c     |  28 ++-
 8 files changed, 516 insertions(+), 169 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index d41d40ac3efd..102e56fbb6de 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -14,27 +14,42 @@ struct bpf_sock_ops_kern;
 extern struct static_key_false cgroup_bpf_enabled_key;
 #define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
 
+struct bpf_prog_list {
+	struct list_head node;
+	struct bpf_prog *prog;
+};
+
+struct bpf_prog_array;
+
 struct cgroup_bpf {
-	/*
-	 * Store two sets of bpf_prog pointers, one for programs that are
-	 * pinned directly to this cgroup, and one for those that are effective
-	 * when this cgroup is accessed.
+	/* array of effective progs in this cgroup */
+	struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE];
+
+	/* attached progs to this cgroup and attach flags
+	 * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will
+	 * have either zero or one element
+	 * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS
 	 */
-	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
-	struct bpf_prog __rcu *effective[MAX_BPF_ATTACH_TYPE];
-	bool disallow_override[MAX_BPF_ATTACH_TYPE];
+	struct list_head progs[MAX_BPF_ATTACH_TYPE];
+	u32 flags[MAX_BPF_ATTACH_TYPE];
+
+	/* temp storage for effective prog array used by prog_attach/detach */
+	struct bpf_prog_array __rcu *inactive;
 };
 
 void cgroup_bpf_put(struct cgroup *cgrp);
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+int cgroup_bpf_inherit(struct cgroup *cgrp);
 
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool overridable);
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags);
 
-/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable);
+/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
@@ -96,8 +111,7 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 
 struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
-static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
-				      struct cgroup *parent) {}
+static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 252f4bc9eb25..a6964b75f070 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -241,6 +241,38 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
 int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr);
 
+/* an array of programs to be executed under rcu_lock.
+ *
+ * Typical usage:
+ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN);
+ *
+ * the structure returned by bpf_prog_array_alloc() should be populated
+ * with program pointers and the last pointer must be NULL.
+ * The user has to keep refcnt on the program and make sure the program
+ * is removed from the array before bpf_prog_put().
+ * The 'struct bpf_prog_array *' should only be replaced with xchg()
+ * since other cpus are walking the array of pointers in parallel.
+ */
+struct bpf_prog_array {
+	struct rcu_head rcu;
+	struct bpf_prog *progs[0];
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	({						\
+		struct bpf_prog **_prog;		\
+		u32 _ret = 1;				\
+		rcu_read_lock();			\
+		_prog = rcu_dereference(array)->progs;	\
+		for (; *_prog; _prog++)			\
+			_ret &= func(*_prog, ctx);	\
+		rcu_read_unlock();			\
+		_ret;					\
+	 })
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 911d454af107..2d2db394b0ca 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -481,7 +481,7 @@ struct sk_filter {
 	struct bpf_prog	*prog;
 };
 
-#define BPF_PROG_RUN(filter, ctx)  (*filter->bpf_func)(ctx, filter->insnsi)
+#define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6d2137b4cf38..762f74bc6c47 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -143,11 +143,47 @@ enum bpf_attach_type {
 
 #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE
 
-/* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command
- * to the given target_fd cgroup the descendent cgroup will be able to
- * override effective bpf program that was inherited from this cgroup
+/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
+ *
+ * NONE(default): No further bpf programs allowed in the subtree.
+ *
+ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program,
+ * the program in this cgroup yields to sub-cgroup program.
+ *
+ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program,
+ * that cgroup program gets run in addition to the program in this cgroup.
+ *
+ * Only one program is allowed to be attached to a cgroup with
+ * NONE or BPF_F_ALLOW_OVERRIDE flag.
+ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will
+ * release old program and attach the new one. Attach flags has to match.
+ *
+ * Multiple programs are allowed to be attached to a cgroup with
+ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order
+ * (those that were attached first, run first)
+ * The programs of sub-cgroup are executed first, then programs of
+ * this cgroup and then programs of parent cgroup.
+ * When children program makes decision (like picking TCP CA or sock bind)
+ * parent program has a chance to override it.
+ *
+ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups.
+ * A cgroup with NONE doesn't allow any programs in sub-cgroups.
+ * Ex1:
+ * cgrp1 (MULTI progs A, B) ->
+ *    cgrp2 (OVERRIDE prog C) ->
+ *      cgrp3 (MULTI prog D) ->
+ *        cgrp4 (OVERRIDE prog E) ->
+ *          cgrp5 (NONE prog F)
+ * the event in cgrp5 triggers execution of F,D,A,B in that order.
+ * if prog F is detached, the execution is E,D,A,B
+ * if prog F and D are detached, the execution is E,A,B
+ * if prog F, E and D are detached, the execution is C,A,B
+ *
+ * All eligible programs are executed regardless of return code from
+ * earlier programs.
  */
 #define BPF_F_ALLOW_OVERRIDE	(1U << 0)
+#define BPF_F_ALLOW_MULTI	(1U << 1)
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
  * verifier will perform strict alignment checking as if the kernel
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..6b7500bbdb53 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,361 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 {
 	unsigned int type;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
-		struct bpf_prog *prog = cgrp->bpf.prog[type];
-
-		if (prog) {
-			bpf_prog_put(prog);
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+		struct list_head *progs = &cgrp->bpf.progs[type];
+		struct bpf_prog_list *pl, *tmp;
+
+		list_for_each_entry_safe(pl, tmp, progs, node) {
+			list_del(&pl->node);
+			bpf_prog_put(pl->prog);
+			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
+		bpf_prog_array_free(cgrp->bpf.effective[type]);
+	}
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+	struct bpf_prog_list *pl;
+	u32 cnt = 0;
+
+	list_for_each_entry(pl, head, node) {
+		if (!pl->prog)
+			continue;
+		cnt++;
 	}
+	return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+				    enum bpf_attach_type type,
+				    u32 new_flags)
+{
+	struct cgroup *p;
+
+	p = cgroup_parent(cgrp);
+	if (!p)
+		return true;
+	do {
+		u32 flags = p->bpf.flags[type];
+		u32 cnt;
+
+		if (flags & BPF_F_ALLOW_MULTI)
+			return true;
+		cnt = prog_list_length(&p->bpf.progs[type]);
+		WARN_ON_ONCE(cnt > 1);
+		if (cnt == 1)
+			return !!(flags & BPF_F_ALLOW_OVERRIDE);
+		p = cgroup_parent(p);
+	} while (p);
+	return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+				   enum bpf_attach_type type,
+				   struct bpf_prog_array __rcu **array)
+{
+	struct bpf_prog_array __rcu *progs;
+	struct bpf_prog_list *pl;
+	struct cgroup *p = cgrp;
+	int cnt = 0;
+
+	/* count number of effective programs by walking parents */
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			cnt += prog_list_length(&p->bpf.progs[type]);
+		p = cgroup_parent(p);
+	} while (p);
+
+	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+	if (!progs)
+		return -ENOMEM;
+
+	/* populate the array with effective progs */
+	cnt = 0;
+	p = cgrp;
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			list_for_each_entry(pl,
+					    &p->bpf.progs[type], node) {
+				if (!pl->prog)
+					continue;
+				rcu_dereference_protected(progs, 1)->
+					progs[cnt++] = pl->prog;
+			}
+		p = cgroup_parent(p);
+	} while (p);
+
+	*array = progs;
+	return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+				     enum bpf_attach_type type,
+				     struct bpf_prog_array __rcu *array)
+{
+	struct bpf_prog_array __rcu *old_array;
+
+	old_array = xchg(&cgrp->bpf.effective[type], array);
+	/* free prog array after grace period, since __cgroup_bpf_run_*()
+	 * might be still walking the array
+	 */
+	bpf_prog_array_free(old_array);
 }
 
 /**
  * cgroup_bpf_inherit() - inherit effective programs from parent
  * @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
  */
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
 {
-	unsigned int type;
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define	NR ARRAY_SIZE(cgrp->bpf.effective)
+	struct bpf_prog_array __rcu *arrays[NR] = {};
+	int i;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
-		struct bpf_prog *e;
+	for (i = 0; i < NR; i++)
+		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
 
-		e = rcu_dereference_protected(parent->bpf.effective[type],
-					      lockdep_is_held(&cgroup_mutex));
-		rcu_assign_pointer(cgrp->bpf.effective[type], e);
-		cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
-	}
+	for (i = 0; i < NR; i++)
+		if (compute_effective_progs(cgrp, i, &arrays[i]))
+			goto cleanup;
+
+	for (i = 0; i < NR; i++)
+		activate_effective_progs(cgrp, i, arrays[i]);
+
+	return 0;
+cleanup:
+	for (i = 0; i < NR; i++)
+		bpf_prog_array_free(arrays[i]);
+	return -ENOMEM;
 }
 
+#define BPF_CGROUP_MAX_PROGS 64
+
 /**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
  *                         propagate the change to descendants
  * @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
  *
  * Must be called with cgroup_mutex held.
  */
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags)
 {
-	struct bpf_prog *old_prog, *effective = NULL;
-	struct cgroup_subsys_state *pos;
-	bool overridable = true;
-
-	if (parent) {
-		overridable = !parent->bpf.disallow_override[type];
-		effective = rcu_dereference_protected(parent->bpf.effective[type],
-						      lockdep_is_held(&cgroup_mutex));
-	}
-
-	if (prog && effective && !overridable)
-		/* if parent has non-overridable prog attached, disallow
-		 * attaching new programs to descendent cgroup
-		 */
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	bool pl_was_allocated;
+	u32 old_flags;
+	int err;
+
+	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+		/* invalid combination */
+		return -EINVAL;
+
+	if (!hierarchy_allows_attach(cgrp, type, flags))
 		return -EPERM;
 
-	if (prog && effective && overridable != new_overridable)
-		/* if parent has overridable prog attached, only
-		 * allow overridable programs in descendent cgroup
+	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+		/* Disallow attaching non-overridable on top
+		 * of existing overridable in this cgroup.
+		 * Disallow attaching multi-prog if overridable or none
 		 */
 		return -EPERM;
 
-	old_prog = cgrp->bpf.prog[type];
-
-	if (prog) {
-		overridable = new_overridable;
-		effective = prog;
-		if (old_prog &&
-		    cgrp->bpf.disallow_override[type] == new_overridable)
-			/* disallow attaching non-overridable on top
-			 * of existing overridable in this cgroup
-			 * and vice versa
-			 */
-			return -EPERM;
+	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+		return -E2BIG;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		list_for_each_entry(pl, progs, node)
+			if (pl->prog == prog)
+				/* disallow attaching the same prog twice */
+				return -EINVAL;
+
+		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+		if (!pl)
+			return -ENOMEM;
+		pl_was_allocated = true;
+		pl->prog = prog;
+		list_add_tail(&pl->node, progs);
+	} else {
+		if (list_empty(progs)) {
+			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+			if (!pl)
+				return -ENOMEM;
+			pl_was_allocated = true;
+			list_add_tail(&pl->node, progs);
+		} else {
+			pl = list_first_entry(progs, typeof(*pl), node);
+			old_prog = pl->prog;
+			pl_was_allocated = false;
+		}
+		pl->prog = prog;
 	}
 
-	if (!prog && !old_prog)
-		/* report error when trying to detach and nothing is attached */
-		return -ENOENT;
+	old_flags = cgrp->bpf.flags[type];
+	cgrp->bpf.flags[type] = flags;
 
-	cgrp->bpf.prog[type] = prog;
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
-	css_for_each_descendant_pre(pos, &cgrp->self) {
-		struct cgroup *desc = container_of(pos, struct cgroup, self);
-
-		/* skip the subtree if the descendant has its own program */
-		if (desc->bpf.prog[type] && desc != cgrp) {
-			pos = css_rightmost_descendant(pos);
-		} else {
-			rcu_assign_pointer(desc->bpf.effective[type],
-					   effective);
-			desc->bpf.disallow_override[type] = !overridable;
-		}
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
 	}
 
-	if (prog)
-		static_branch_inc(&cgroup_bpf_enabled_key);
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	static_branch_inc(&cgroup_bpf_enabled_key);
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
 	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and cleanup the prog list */
+	pl->prog = old_prog;
+	if (pl_was_allocated) {
+		list_del(&pl->node);
+		kfree(pl);
+	}
+	return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 unused_flags)
+{
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	int err;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		if (!prog)
+			/* to detach MULTI prog the user has to specify valid FD
+			 * of the program to be detached
+			 */
+			return -EINVAL;
+	} else {
+		if (list_empty(progs))
+			/* report error when trying to detach and nothing is attached */
+			return -ENOENT;
+	}
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		/* find the prog and detach it */
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog != prog)
+				continue;
+			old_prog = prog;
+			/* mark it deleted, so it's ignored while
+			 * recomputing effective
+			 */
+			pl->prog = NULL;
+			break;
+		}
+		if (!old_prog)
+			return -ENOENT;
+	} else {
+		/* to maintain backward compatibility NONE and OVERRIDE cgroups
+		 * allow detaching with invalid FD (prog==NULL)
+		 */
+		pl = list_first_entry(progs, typeof(*pl), node);
+		old_prog = pl->prog;
+		pl->prog = NULL;
+	}
+
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
+	}
+
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* now can actually delete it from this cgroup list */
+	list_del(&pl->node);
+	kfree(pl);
+	if (list_empty(progs))
+		/* last program was detached, reset flags to zero */
+		cgrp->bpf.flags[type] = 0;
+
+	bpf_prog_put(old_prog);
+	static_branch_dec(&cgroup_bpf_enabled_key);
+	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and restore back old_prog */
+	pl->prog = old_prog;
+	return err;
 }
 
 /**
@@ -171,36 +403,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type)
 {
-	struct bpf_prog *prog;
+	unsigned int offset = skb->data - skb_network_header(skb);
+	struct sock *save_sk;
 	struct cgroup *cgrp;
-	int ret = 0;
+	int ret;
 
 	if (!sk || !sk_fullsock(sk))
 		return 0;
 
-	if (sk->sk_family != AF_INET &&
-	    sk->sk_family != AF_INET6)
+	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog) {
-		unsigned int offset = skb->data - skb_network_header(skb);
-		struct sock *save_sk = skb->sk;
-
-		skb->sk = sk;
-		__skb_push(skb, offset);
-		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
-		__skb_pull(skb, offset);
-		skb->sk = save_sk;
-	}
-
-	rcu_read_unlock();
-
-	return ret;
+	save_sk = skb->sk;
+	skb->sk = sk;
+	__skb_push(skb, offset);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+				 bpf_prog_run_save_cb);
+	__skb_pull(skb, offset);
+	skb->sk = save_sk;
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 
@@ -221,19 +443,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
-
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
+	int ret;
 
-	rcu_read_unlock();
-
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -258,18 +471,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
-
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
-
-	rcu_read_unlock();
+	int ret;
 
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+				 BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..6b49e1991ae7 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1381,6 +1381,37 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+	struct bpf_prog_array hdr;
+	struct bpf_prog *null_prog;
+} empty_prog_array = {
+	.null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+	if (prog_cnt)
+		return kzalloc(sizeof(struct bpf_prog_array) +
+			       sizeof(struct bpf_prog *) * (prog_cnt + 1),
+			       flags);
+
+	return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+	if (!progs ||
+	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+		return;
+	kfree_rcu(progs, rcu);
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b927da66f653..51bee695d32c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1168,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
 	return 0;
 }
 
+#define BPF_F_ATTACH_MASK \
+	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
@@ -1181,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
 		return -EINVAL;
 
-	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
 		return -EINVAL;
 
 	switch (attr->attach_type) {
@@ -1212,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return PTR_ERR(cgrp);
 	}
 
-	ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
-				attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
 	cgroup_put(cgrp);
@@ -1225,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
+	enum bpf_prog_type ptype;
+	struct bpf_prog *prog;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -1237,23 +1242,33 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
+		break;
 	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	case BPF_CGROUP_SOCK_OPS:
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp))
-			return PTR_ERR(cgrp);
-
-		ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		ret = sockmap_get_from_fd(attr, false);
-		break;
+		return sockmap_get_from_fd(attr, false);
 	default:
 		return -EINVAL;
 	}
 
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+	cgroup_put(cgrp);
 	return ret;
 }
 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..57eb866ae78d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 	if (ret)
 		goto destroy_root;
 
+	ret = cgroup_bpf_inherit(root_cgrp);
+	WARN_ON_ONCE(ret);
+
 	trace_cgroup_setup_root(root);
 
 	/*
@@ -4713,6 +4716,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->self.parent = &parent->self;
 	cgrp->root = root;
 	cgrp->level = level;
+	ret = cgroup_bpf_inherit(cgrp);
+	if (ret)
+		goto out_idr_free;
 
 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4747,13 +4753,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
-	if (parent)
-		cgroup_bpf_inherit(cgrp, parent);
-
 	cgroup_propagate_control(cgrp);
 
 	return cgrp;
 
+out_idr_free:
+	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5736,14 +5741,23 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
 
 #ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
 {
-	struct cgroup *parent = cgroup_parent(cgrp);
 	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+	ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
-- 
cgit v1.3-14-g43fede


From 468e2f64d220fe2dc11caa2bcb9b3a1e50fc7321 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:22 -0700
Subject: bpf: introduce BPF_PROG_QUERY command

introduce BPF_PROG_QUERY command to retrieve a set of either
attached programs to given cgroup or a set of effective programs
that will execute for events within a cgroup

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
for cgroup bits
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h |  4 ++++
 include/linux/bpf.h        |  3 +++
 include/uapi/linux/bpf.h   | 13 +++++++++++++
 kernel/bpf/cgroup.c        | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/core.c          | 38 ++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c       | 34 ++++++++++++++++++++++++++++++++++
 kernel/cgroup/cgroup.c     | 10 ++++++++++
 7 files changed, 148 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 102e56fbb6de..359b6f5d3d90 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -44,12 +44,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
 int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 			enum bpf_attach_type type, u32 flags);
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr);
 
 /* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
 int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
 int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 		      enum bpf_attach_type type, u32 flags);
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr);
 
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index a6964b75f070..a67daea731ab 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -260,6 +260,9 @@ struct bpf_prog_array {
 
 struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
 void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt);
 
 #define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
 	({						\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 762f74bc6c47..cb2b9f95160a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -92,6 +92,7 @@ enum bpf_cmd {
 	BPF_PROG_GET_FD_BY_ID,
 	BPF_MAP_GET_FD_BY_ID,
 	BPF_OBJ_GET_INFO_BY_FD,
+	BPF_PROG_QUERY,
 };
 
 enum bpf_map_type {
@@ -211,6 +212,9 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE		(1U << 2)
 
+/* flags for BPF_PROG_QUERY */
+#define BPF_F_QUERY_EFFECTIVE	(1U << 0)
+
 #define BPF_OBJ_NAME_LEN 16U
 
 union bpf_attr {
@@ -289,6 +293,15 @@ union bpf_attr {
 		__u32		info_len;
 		__aligned_u64	info;
 	} info;
+
+	struct { /* anonymous struct used by BPF_PROG_QUERY command */
+		__u32		target_fd;	/* container object to query */
+		__u32		attach_type;
+		__u32		query_flags;
+		__u32		attach_flags;
+		__aligned_u64	prog_ids;
+		__u32		prog_cnt;
+	} query;
 } __attribute__((aligned(8)));
 
 /* BPF helper function descriptions:
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6b7500bbdb53..e88abc0865d5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -384,6 +384,52 @@ cleanup:
 	return err;
 }
 
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	enum bpf_attach_type type = attr->query.attach_type;
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	int cnt, ret = 0, i;
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+	else
+		cnt = prog_list_length(progs);
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+		return -EFAULT;
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+		return -EFAULT;
+	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+		/* return early if user requested only program count + flags */
+		return 0;
+	if (attr->query.prog_cnt < cnt) {
+		cnt = attr->query.prog_cnt;
+		ret = -ENOSPC;
+	}
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+						   prog_ids, cnt);
+	} else {
+		struct bpf_prog_list *pl;
+		u32 id;
+
+		i = 0;
+		list_for_each_entry(pl, progs, node) {
+			id = pl->prog->aux->id;
+			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+				return -EFAULT;
+			if (++i == cnt)
+				break;
+		}
+	}
+	return ret;
+}
+
 /**
  * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
  * @sk: The socket sending or receiving traffic
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 6b49e1991ae7..eba966c09053 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1412,6 +1412,44 @@ void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
 	kfree_rcu(progs, rcu);
 }
 
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+	struct bpf_prog **prog;
+	u32 cnt = 0;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++)
+		cnt++;
+	rcu_read_unlock();
+	return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt)
+{
+	struct bpf_prog **prog;
+	u32 i = 0, id;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++) {
+		id = (*prog)->aux->id;
+		if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+			rcu_read_unlock();
+			return -EFAULT;
+		}
+		if (++i == cnt) {
+			prog++;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (*prog)
+		return -ENOSPC;
+	return 0;
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 51bee695d32c..0048cb24ba7b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1272,6 +1272,37 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	return ret;
 }
 
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (CHECK_ATTR(BPF_PROG_QUERY))
+		return -EINVAL;
+	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+		return -EINVAL;
+
+	switch (attr->query.attach_type) {
+	case BPF_CGROUP_INET_INGRESS:
+	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_SOCK_OPS:
+		break;
+	default:
+		return -EINVAL;
+	}
+	cgrp = cgroup_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+	ret = cgroup_bpf_query(cgrp, attr, uattr);
+	cgroup_put(cgrp);
+	return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1568,6 +1599,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_DETACH:
 		err = bpf_prog_detach(&attr);
 		break;
+	case BPF_PROG_QUERY:
+		err = bpf_prog_query(&attr, uattr);
+		break;
 #endif
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 57eb866ae78d..269512b94a94 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5761,4 +5761,14 @@ int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_query(cgrp, attr, uattr);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
-- 
cgit v1.3-14-g43fede


From 390ee7e29fc8e6e90d3065b00afb047c4ee552f9 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 2 Oct 2017 22:50:23 -0700
Subject: bpf: enforce return code for cgroup-bpf programs

with addition of tnum logic the verifier got smart enough and
we can enforce return codes at program load time.
For now do so for cgroup-bpf program types.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c                       | 40 ++++++++++++++++
 tools/testing/selftests/bpf/test_verifier.c | 72 +++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4cf9b72c59a0..52b022310f6a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3073,6 +3073,43 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+static int check_return_code(struct bpf_verifier_env *env)
+{
+	struct bpf_reg_state *reg;
+	struct tnum range = tnum_range(0, 1);
+
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_CGROUP_SKB:
+	case BPF_PROG_TYPE_CGROUP_SOCK:
+	case BPF_PROG_TYPE_SOCK_OPS:
+		break;
+	default:
+		return 0;
+	}
+
+	reg = &env->cur_state.regs[BPF_REG_0];
+	if (reg->type != SCALAR_VALUE) {
+		verbose("At program exit the register R0 is not a known value (%s)\n",
+			reg_type_str[reg->type]);
+		return -EINVAL;
+	}
+
+	if (!tnum_in(range, reg->var_off)) {
+		verbose("At program exit the register R0 ");
+		if (!tnum_is_unknown(reg->var_off)) {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose("has value %s", tn_buf);
+		} else {
+			verbose("has unknown scalar value");
+		}
+		verbose(" should have been 0 or 1\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
 /* non-recursive DFS pseudo code
  * 1  procedure DFS-iterative(G,v):
  * 2      label v as discovered
@@ -3863,6 +3900,9 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EACCES;
 				}
 
+				err = check_return_code(env);
+				if (err)
+					return err;
 process_bpf_exit:
 				insn_idx = pop_stack(env, &prev_insn_idx);
 				if (insn_idx < 0) {
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 290d5056c165..cc91d0159f43 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -6892,6 +6892,78 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"bpf_exit with invalid return code. test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0xffffffff)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0x3)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test5",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x2; 0x0)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test6",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 is not a known value (ctx)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test7",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
+			BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has unknown scalar value",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit v1.3-14-g43fede


From 58e1177b4cd10b0d358faf7d7ebb3779f98bc3ea Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 4 Oct 2017 16:26:55 -0700
Subject: timer: Convert schedule_timeout() to use from_timer()

In preparation for unconditionally passing the struct timer_list pointer to
all timer callbacks, switch to using the new from_timer() helper and passing
the timer pointer explicitly. Since this special timer is on the stack, it
needs to have a wrapper structure to carry state once .data is eliminated.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mips@linux-mips.org
Cc: Petr Mladek <pmladek@suse.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Kalle Valo <kvalo@qca.qualcomm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: linux1394-devel@lists.sourceforge.net
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: linux-s390@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
Cc: Wim Van Sebroeck <wim@iguana.be>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Harish Patil <harish.patil@cavium.com>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Manish Chopra <manish.chopra@cavium.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-pm@vger.kernel.org
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mark Gross <mark.gross@intel.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: linux-watchdog@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Michael Reed <mdr@sgi.com>
Cc: netdev@vger.kernel.org
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Link: https://lkml.kernel.org/r/1507159627-127660-2-git-send-email-keescook@chromium.org
---
 include/linux/timer.h |  8 ++++++++
 kernel/time/timer.c   | 26 +++++++++++++++++++-------
 2 files changed, 27 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 6383c528b148..5ef5c9e41a09 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -179,6 +179,14 @@ static inline void timer_setup(struct timer_list *timer,
 		      (TIMER_DATA_TYPE)timer, flags);
 }
 
+static inline void timer_setup_on_stack(struct timer_list *timer,
+			       void (*callback)(struct timer_list *),
+			       unsigned int flags)
+{
+	__setup_timer_on_stack(timer, (TIMER_FUNC_TYPE)callback,
+			       (TIMER_DATA_TYPE)timer, flags);
+}
+
 #define from_timer(var, callback_timer, timer_fieldname) \
 	container_of(callback_timer, typeof(*var), timer_fieldname)
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f2674a056c26..38613ced2324 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1668,9 +1668,20 @@ void run_local_timers(void)
 	raise_softirq(TIMER_SOFTIRQ);
 }
 
-static void process_timeout(unsigned long __data)
+/*
+ * Since schedule_timeout()'s timer is defined on the stack, it must store
+ * the target task on the stack as well.
+ */
+struct process_timer {
+	struct timer_list timer;
+	struct task_struct *task;
+};
+
+static void process_timeout(struct timer_list *t)
 {
-	wake_up_process((struct task_struct *)__data);
+	struct process_timer *timeout = from_timer(timeout, t, timer);
+
+	wake_up_process(timeout->task);
 }
 
 /**
@@ -1704,7 +1715,7 @@ static void process_timeout(unsigned long __data)
  */
 signed long __sched schedule_timeout(signed long timeout)
 {
-	struct timer_list timer;
+	struct process_timer timer;
 	unsigned long expire;
 
 	switch (timeout)
@@ -1738,13 +1749,14 @@ signed long __sched schedule_timeout(signed long timeout)
 
 	expire = timeout + jiffies;
 
-	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire, false);
+	timer.task = current;
+	timer_setup_on_stack(&timer.timer, process_timeout, 0);
+	__mod_timer(&timer.timer, expire, false);
 	schedule();
-	del_singleshot_timer_sync(&timer);
+	del_singleshot_timer_sync(&timer.timer);
 
 	/* Remove the timer from the object tracker */
-	destroy_timer_on_stack(&timer);
+	destroy_timer_on_stack(&timer.timer);
 
 	timeout = expire - jiffies;
 
-- 
cgit v1.3-14-g43fede


From 5cd79d6abd2c142352dead0e3df04e86ee32f5d3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 4 Oct 2017 16:27:00 -0700
Subject: timer: Remove users of TIMER_DEFERRED_INITIALIZER

This removes uses of TIMER_DEFERRED_INITIALIZER and chooses a location
to call timer_setup() from before add_timer() or mod_timer() is called.
Adjusts callbacks to use from_timer() as needed.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mips@linux-mips.org
Cc: Petr Mladek <pmladek@suse.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Kalle Valo <kvalo@qca.qualcomm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: linux1394-devel@lists.sourceforge.net
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: linux-s390@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
Cc: Wim Van Sebroeck <wim@iguana.be>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Harish Patil <harish.patil@cavium.com>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Manish Chopra <manish.chopra@cavium.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-pm@vger.kernel.org
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mark Gross <mark.gross@intel.com>
Cc: linux-watchdog@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Michael Reed <mdr@sgi.com>
Cc: netdev@vger.kernel.org
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Link: https://lkml.kernel.org/r/1507159627-127660-7-git-send-email-keescook@chromium.org
---
 arch/s390/kernel/lgr.c      | 6 +++---
 arch/s390/kernel/topology.c | 6 +++---
 kernel/workqueue.c          | 8 +++-----
 3 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c
index ae7dff110054..bf9622f0e6b1 100644
--- a/arch/s390/kernel/lgr.c
+++ b/arch/s390/kernel/lgr.c
@@ -153,14 +153,13 @@ static void lgr_timer_set(void);
 /*
  * LGR timer callback
  */
-static void lgr_timer_fn(unsigned long ignored)
+static void lgr_timer_fn(struct timer_list *unused)
 {
 	lgr_info_log();
 	lgr_timer_set();
 }
 
-static struct timer_list lgr_timer =
-	TIMER_DEFERRED_INITIALIZER(lgr_timer_fn, 0, 0);
+static struct timer_list lgr_timer;
 
 /*
  * Setup next LGR timer
@@ -181,6 +180,7 @@ static int __init lgr_init(void)
 	debug_register_view(lgr_dbf, &debug_hex_ascii_view);
 	lgr_info_get(&lgr_info_last);
 	debug_event(lgr_dbf, 1, &lgr_info_last, sizeof(lgr_info_last));
+	timer_setup(&lgr_timer, lgr_timer_fn, TIMER_DEFERRABLE);
 	lgr_timer_set();
 	return 0;
 }
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index ed0bdd220e1a..d7ece9888c29 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -320,15 +320,14 @@ static void topology_flush_work(void)
 	flush_work(&topology_work);
 }
 
-static void topology_timer_fn(unsigned long ignored)
+static void topology_timer_fn(struct timer_list *unused)
 {
 	if (ptf(PTF_CHECK))
 		topology_schedule_update();
 	set_topology_timer();
 }
 
-static struct timer_list topology_timer =
-	TIMER_DEFERRED_INITIALIZER(topology_timer_fn, 0, 0);
+static struct timer_list topology_timer;
 
 static atomic_t topology_poll = ATOMIC_INIT(0);
 
@@ -597,6 +596,7 @@ static struct ctl_table topology_dir_table[] = {
 
 static int __init topology_init(void)
 {
+	timer_setup(&topology_timer, topology_timer_fn, TIMER_DEFERRABLE);
 	if (MACHINE_HAS_TOPOLOGY)
 		set_topology_timer();
 	else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..a5361fc6215d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5390,11 +5390,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
  */
 #ifdef CONFIG_WQ_WATCHDOG
 
-static void wq_watchdog_timer_fn(unsigned long data);
-
 static unsigned long wq_watchdog_thresh = 30;
-static struct timer_list wq_watchdog_timer =
-	TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0);
+static struct timer_list wq_watchdog_timer;
 
 static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;
@@ -5408,7 +5405,7 @@ static void wq_watchdog_reset_touched(void)
 		per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
 }
 
-static void wq_watchdog_timer_fn(unsigned long data)
+static void wq_watchdog_timer_fn(struct timer_list *unused)
 {
 	unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
 	bool lockup_detected = false;
@@ -5510,6 +5507,7 @@ module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
 
 static void wq_watchdog_init(void)
 {
+	timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
 	wq_watchdog_set_thresh(wq_watchdog_thresh);
 }
 
-- 
cgit v1.3-14-g43fede


From 1d27e3e2252ba9d949ca82fbdb73cde102cb2067 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 4 Oct 2017 16:27:04 -0700
Subject: timer: Remove expires and data arguments from DEFINE_TIMER

Drop the arguments from the macro and adjust all callers with the
following script:

  perl -pi -e 's/DEFINE_TIMER\((.*), 0, 0\);/DEFINE_TIMER($1);/g;' \
    $(git grep DEFINE_TIMER | cut -d: -f1 | sort -u | grep -v timer.h)

Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org> # for m68k parts
Acked-by: Guenter Roeck <linux@roeck-us.net> # for watchdog parts
Acked-by: David S. Miller <davem@davemloft.net> # for networking parts
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Kalle Valo <kvalo@codeaurora.org> # for wireless parts
Acked-by: Arnd Bergmann <arnd@arndb.de>
Cc: linux-mips@linux-mips.org
Cc: Petr Mladek <pmladek@suse.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Kalle Valo <kvalo@qca.qualcomm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: linux1394-devel@lists.sourceforge.net
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: linux-s390@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
Cc: Wim Van Sebroeck <wim@iguana.be>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Harish Patil <harish.patil@cavium.com>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Michael Reed <mdr@sgi.com>
Cc: Manish Chopra <manish.chopra@cavium.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-pm@vger.kernel.org
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mark Gross <mark.gross@intel.com>
Cc: linux-watchdog@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: netdev@vger.kernel.org
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Link: https://lkml.kernel.org/r/1507159627-127660-11-git-send-email-keescook@chromium.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/arm/mach-ixp4xx/dsmg600-setup.c      | 2 +-
 arch/arm/mach-ixp4xx/nas100d-setup.c      | 2 +-
 arch/m68k/amiga/amisound.c                | 2 +-
 arch/m68k/mac/macboing.c                  | 2 +-
 arch/mips/mti-malta/malta-display.c       | 2 +-
 arch/parisc/kernel/pdc_cons.c             | 2 +-
 arch/s390/mm/cmm.c                        | 2 +-
 drivers/atm/idt77105.c                    | 4 ++--
 drivers/atm/iphase.c                      | 2 +-
 drivers/block/ataflop.c                   | 8 ++++----
 drivers/char/dtlk.c                       | 2 +-
 drivers/char/hangcheck-timer.c            | 2 +-
 drivers/char/nwbutton.c                   | 2 +-
 drivers/char/rtc.c                        | 2 +-
 drivers/input/touchscreen/s3c2410_ts.c    | 2 +-
 drivers/net/cris/eth_v10.c                | 6 +++---
 drivers/net/hamradio/yam.c                | 2 +-
 drivers/net/wireless/atmel/at76c50x-usb.c | 2 +-
 drivers/staging/speakup/main.c            | 2 +-
 drivers/staging/speakup/synth.c           | 2 +-
 drivers/tty/cyclades.c                    | 2 +-
 drivers/tty/isicom.c                      | 2 +-
 drivers/tty/moxa.c                        | 2 +-
 drivers/tty/rocket.c                      | 2 +-
 drivers/tty/vt/keyboard.c                 | 2 +-
 drivers/tty/vt/vt.c                       | 2 +-
 drivers/watchdog/alim7101_wdt.c           | 2 +-
 drivers/watchdog/machzwd.c                | 2 +-
 drivers/watchdog/mixcomwd.c               | 2 +-
 drivers/watchdog/sbc60xxwdt.c             | 2 +-
 drivers/watchdog/sc520_wdt.c              | 2 +-
 drivers/watchdog/via_wdt.c                | 2 +-
 drivers/watchdog/w83877f_wdt.c            | 2 +-
 drivers/xen/grant-table.c                 | 2 +-
 fs/pstore/platform.c                      | 2 +-
 include/linux/timer.h                     | 4 ++--
 kernel/irq/spurious.c                     | 2 +-
 lib/random32.c                            | 2 +-
 net/atm/mpc.c                             | 2 +-
 net/decnet/dn_route.c                     | 2 +-
 net/ipv6/ip6_flowlabel.c                  | 2 +-
 net/netrom/nr_loopback.c                  | 2 +-
 security/keys/gc.c                        | 2 +-
 sound/oss/midibuf.c                       | 2 +-
 sound/oss/soundcard.c                     | 2 +-
 sound/oss/sys_timer.c                     | 2 +-
 sound/oss/uart6850.c                      | 2 +-
 47 files changed, 54 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c
index b3bd0e137f6d..b3689a141ec6 100644
--- a/arch/arm/mach-ixp4xx/dsmg600-setup.c
+++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c
@@ -174,7 +174,7 @@ static int power_button_countdown;
 #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */
 
 static void dsmg600_power_handler(unsigned long data);
-static DEFINE_TIMER(dsmg600_power_timer, dsmg600_power_handler, 0, 0);
+static DEFINE_TIMER(dsmg600_power_timer, dsmg600_power_handler);
 
 static void dsmg600_power_handler(unsigned long data)
 {
diff --git a/arch/arm/mach-ixp4xx/nas100d-setup.c b/arch/arm/mach-ixp4xx/nas100d-setup.c
index 4e0f762bc651..562d05f9888e 100644
--- a/arch/arm/mach-ixp4xx/nas100d-setup.c
+++ b/arch/arm/mach-ixp4xx/nas100d-setup.c
@@ -197,7 +197,7 @@ static int power_button_countdown;
 #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */
 
 static void nas100d_power_handler(unsigned long data);
-static DEFINE_TIMER(nas100d_power_timer, nas100d_power_handler, 0, 0);
+static DEFINE_TIMER(nas100d_power_timer, nas100d_power_handler);
 
 static void nas100d_power_handler(unsigned long data)
 {
diff --git a/arch/m68k/amiga/amisound.c b/arch/m68k/amiga/amisound.c
index 90a60d758f8b..a23f48181fd6 100644
--- a/arch/m68k/amiga/amisound.c
+++ b/arch/m68k/amiga/amisound.c
@@ -66,7 +66,7 @@ void __init amiga_init_sound(void)
 }
 
 static void nosound( unsigned long ignored );
-static DEFINE_TIMER(sound_timer, nosound, 0, 0);
+static DEFINE_TIMER(sound_timer, nosound);
 
 void amiga_mksound( unsigned int hz, unsigned int ticks )
 {
diff --git a/arch/m68k/mac/macboing.c b/arch/m68k/mac/macboing.c
index ffaa1f6439ae..9a52aff183d0 100644
--- a/arch/m68k/mac/macboing.c
+++ b/arch/m68k/mac/macboing.c
@@ -56,7 +56,7 @@ static void ( *mac_special_bell )( unsigned int, unsigned int, unsigned int );
 /*
  * our timer to start/continue/stop the bell
  */
-static DEFINE_TIMER(mac_sound_timer, mac_nosound, 0, 0);
+static DEFINE_TIMER(mac_sound_timer, mac_nosound);
 
 /*
  * Sort of initialize the sound chip (called from mac_mksound on the first
diff --git a/arch/mips/mti-malta/malta-display.c b/arch/mips/mti-malta/malta-display.c
index ac813158b9b8..063de44675ce 100644
--- a/arch/mips/mti-malta/malta-display.c
+++ b/arch/mips/mti-malta/malta-display.c
@@ -37,7 +37,7 @@ void mips_display_message(const char *str)
 }
 
 static void scroll_display_message(unsigned long unused);
-static DEFINE_TIMER(mips_scroll_timer, scroll_display_message, 0, 0);
+static DEFINE_TIMER(mips_scroll_timer, scroll_display_message);
 
 static void scroll_display_message(unsigned long unused)
 {
diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c
index 10a5ae9553fd..27a2dd616a7d 100644
--- a/arch/parisc/kernel/pdc_cons.c
+++ b/arch/parisc/kernel/pdc_cons.c
@@ -92,7 +92,7 @@ static int pdc_console_setup(struct console *co, char *options)
 #define PDC_CONS_POLL_DELAY (30 * HZ / 1000)
 
 static void pdc_console_poll(unsigned long unused);
-static DEFINE_TIMER(pdc_console_timer, pdc_console_poll, 0, 0);
+static DEFINE_TIMER(pdc_console_timer, pdc_console_poll);
 static struct tty_port tty_port;
 
 static int pdc_console_tty_open(struct tty_struct *tty, struct file *filp)
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 829c63dbc81a..2dbdcd85b68f 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -56,7 +56,7 @@ static DEFINE_SPINLOCK(cmm_lock);
 
 static struct task_struct *cmm_thread_ptr;
 static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait);
-static DEFINE_TIMER(cmm_timer, NULL, 0, 0);
+static DEFINE_TIMER(cmm_timer, NULL);
 
 static void cmm_timer_fn(unsigned long);
 static void cmm_set_timer(void);
diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c
index 082aa02abc57..57af9fd198e4 100644
--- a/drivers/atm/idt77105.c
+++ b/drivers/atm/idt77105.c
@@ -49,8 +49,8 @@ static void idt77105_stats_timer_func(unsigned long);
 static void idt77105_restart_timer_func(unsigned long);
 
 
-static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func, 0, 0);
-static DEFINE_TIMER(restart_timer, idt77105_restart_timer_func, 0, 0);
+static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func);
+static DEFINE_TIMER(restart_timer, idt77105_restart_timer_func);
 static int start_timer = 1;
 static struct idt77105_priv *idt77105_all = NULL;
 
diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index fc72b763fdd7..ad6b582c268e 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -76,7 +76,7 @@ static IADEV *ia_dev[8];
 static struct atm_dev *_ia_dev[8];
 static int iadev_count;
 static void ia_led_timer(unsigned long arg);
-static DEFINE_TIMER(ia_timer, ia_led_timer, 0, 0);
+static DEFINE_TIMER(ia_timer, ia_led_timer);
 static int IA_TX_BUF = DFL_TX_BUFFERS, IA_TX_BUF_SZ = DFL_TX_BUF_SZ;
 static int IA_RX_BUF = DFL_RX_BUFFERS, IA_RX_BUF_SZ = DFL_RX_BUF_SZ;
 static uint IADebugFlag = /* IF_IADBG_ERR | IF_IADBG_CBR| IF_IADBG_INIT_ADAPTER
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 92da886180aa..ae596e55bcb6 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -373,10 +373,10 @@ static void floppy_release(struct gendisk *disk, fmode_t mode);
 
 /************************* End of Prototypes **************************/
 
-static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer, 0, 0);
-static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0);
-static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0);
-static DEFINE_TIMER(fd_timer, check_change, 0, 0);
+static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer);
+static DEFINE_TIMER(readtrack_timer, fd_readtrack_check);
+static DEFINE_TIMER(timeout_timer, fd_times_out);
+static DEFINE_TIMER(fd_timer, check_change);
 	
 static void fd_end_request_cur(blk_status_t err)
 {
diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c
index 58471394beb9..1a0385ed6417 100644
--- a/drivers/char/dtlk.c
+++ b/drivers/char/dtlk.c
@@ -84,7 +84,7 @@ static int dtlk_has_indexing;
 static unsigned int dtlk_portlist[] =
 {0x25e, 0x29e, 0x2de, 0x31e, 0x35e, 0x39e, 0};
 static wait_queue_head_t dtlk_process_list;
-static DEFINE_TIMER(dtlk_timer, dtlk_timer_tick, 0, 0);
+static DEFINE_TIMER(dtlk_timer, dtlk_timer_tick);
 
 /* prototypes for file_operations struct */
 static ssize_t dtlk_read(struct file *, char __user *,
diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 5406b90bf626..5b8db2ed844d 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -124,7 +124,7 @@ static unsigned long long hangcheck_tsc, hangcheck_tsc_margin;
 
 static void hangcheck_fire(unsigned long);
 
-static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire, 0, 0);
+static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire);
 
 static void hangcheck_fire(unsigned long data)
 {
diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c
index e6d0d271c58c..44006ed9558f 100644
--- a/drivers/char/nwbutton.c
+++ b/drivers/char/nwbutton.c
@@ -27,7 +27,7 @@ static void button_sequence_finished (unsigned long parameters);
 
 static int button_press_count;		/* The count of button presses */
 /* Times for the end of a sequence */
-static DEFINE_TIMER(button_timer, button_sequence_finished, 0, 0);
+static DEFINE_TIMER(button_timer, button_sequence_finished);
 static DECLARE_WAIT_QUEUE_HEAD(button_wait_queue); /* Used for blocking read */
 static char button_output_buffer[32];	/* Stores data to write out of device */
 static int bcount;			/* The number of bytes in the buffer */
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 974d48927b07..616871e68e09 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -137,7 +137,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rtc_wait);
 #ifdef RTC_IRQ
 static void rtc_dropped_irq(unsigned long data);
 
-static DEFINE_TIMER(rtc_irq_timer, rtc_dropped_irq, 0, 0);
+static DEFINE_TIMER(rtc_irq_timer, rtc_dropped_irq);
 #endif
 
 static ssize_t rtc_read(struct file *file, char __user *buf,
diff --git a/drivers/input/touchscreen/s3c2410_ts.c b/drivers/input/touchscreen/s3c2410_ts.c
index 3b3db8c868e0..d3265b6b58b8 100644
--- a/drivers/input/touchscreen/s3c2410_ts.c
+++ b/drivers/input/touchscreen/s3c2410_ts.c
@@ -145,7 +145,7 @@ static void touch_timer_fire(unsigned long data)
 	}
 }
 
-static DEFINE_TIMER(touch_timer, touch_timer_fire, 0, 0);
+static DEFINE_TIMER(touch_timer, touch_timer_fire);
 
 /**
  * stylus_irq - touchscreen stylus event interrupt
diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c
index 017f48cdcab9..1fcc86fa4e05 100644
--- a/drivers/net/cris/eth_v10.c
+++ b/drivers/net/cris/eth_v10.c
@@ -165,8 +165,8 @@ static unsigned int network_rec_config_shadow = 0;
 static unsigned int network_tr_ctrl_shadow = 0;
 
 /* Network speed indication. */
-static DEFINE_TIMER(speed_timer, NULL, 0, 0);
-static DEFINE_TIMER(clear_led_timer, NULL, 0, 0);
+static DEFINE_TIMER(speed_timer, NULL);
+static DEFINE_TIMER(clear_led_timer, NULL);
 static int current_speed; /* Speed read from transceiver */
 static int current_speed_selection; /* Speed selected by user */
 static unsigned long led_next_time;
@@ -174,7 +174,7 @@ static int led_active;
 static int rx_queue_len;
 
 /* Duplex */
-static DEFINE_TIMER(duplex_timer, NULL, 0, 0);
+static DEFINE_TIMER(duplex_timer, NULL);
 static int full_duplex;
 static enum duplex current_duplex;
 
diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c
index 7a7c5224a336..104f71fa9c5e 100644
--- a/drivers/net/hamradio/yam.c
+++ b/drivers/net/hamradio/yam.c
@@ -157,7 +157,7 @@ static struct net_device *yam_devs[NR_PORTS];
 
 static struct yam_mcs *yam_data;
 
-static DEFINE_TIMER(yam_timer, NULL, 0, 0);
+static DEFINE_TIMER(yam_timer, NULL);
 
 /* --------------------------------------------------------------------- */
 
diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c
index 94bf01f8b2a8..ede89d4ffc88 100644
--- a/drivers/net/wireless/atmel/at76c50x-usb.c
+++ b/drivers/net/wireless/atmel/at76c50x-usb.c
@@ -519,7 +519,7 @@ exit:
 /* LED trigger */
 static int tx_activity;
 static void at76_ledtrig_tx_timerfunc(unsigned long data);
-static DEFINE_TIMER(ledtrig_tx_timer, at76_ledtrig_tx_timerfunc, 0, 0);
+static DEFINE_TIMER(ledtrig_tx_timer, at76_ledtrig_tx_timerfunc);
 DEFINE_LED_TRIGGER(ledtrig_tx);
 
 static void at76_ledtrig_tx_timerfunc(unsigned long data)
diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c
index 56f7be6af1f6..585925bb49a4 100644
--- a/drivers/staging/speakup/main.c
+++ b/drivers/staging/speakup/main.c
@@ -1165,7 +1165,7 @@ static const int NUM_CTL_LABELS = (MSG_CTL_END - MSG_CTL_START + 1);
 
 static void read_all_doc(struct vc_data *vc);
 static void cursor_done(u_long data);
-static DEFINE_TIMER(cursor_timer, cursor_done, 0, 0);
+static DEFINE_TIMER(cursor_timer, cursor_done);
 
 static void do_handle_shift(struct vc_data *vc, u_char value, char up_flag)
 {
diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c
index a1ca68c76579..6ddd3fc3f08d 100644
--- a/drivers/staging/speakup/synth.c
+++ b/drivers/staging/speakup/synth.c
@@ -158,7 +158,7 @@ static void thread_wake_up(u_long data)
 	wake_up_interruptible_all(&speakup_event);
 }
 
-static DEFINE_TIMER(thread_timer, thread_wake_up, 0, 0);
+static DEFINE_TIMER(thread_timer, thread_wake_up);
 
 void synth_start(void)
 {
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index d272bc4e7fb5..dac8a1a8e4ac 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -283,7 +283,7 @@ static void cyz_poll(unsigned long);
 /* The Cyclades-Z polling cycle is defined by this variable */
 static long cyz_polling_cycle = CZ_DEF_POLL;
 
-static DEFINE_TIMER(cyz_timerlist, cyz_poll, 0, 0);
+static DEFINE_TIMER(cyz_timerlist, cyz_poll);
 
 #else				/* CONFIG_CYZ_INTR */
 static void cyz_rx_restart(unsigned long);
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
index 61ecdd6b2fc2..40af32108ff5 100644
--- a/drivers/tty/isicom.c
+++ b/drivers/tty/isicom.c
@@ -177,7 +177,7 @@ static struct tty_driver *isicom_normal;
 static void isicom_tx(unsigned long _data);
 static void isicom_start(struct tty_struct *tty);
 
-static DEFINE_TIMER(tx, isicom_tx, 0, 0);
+static DEFINE_TIMER(tx, isicom_tx);
 
 /*   baud index mappings from linux defns to isi */
 
diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c
index 7f3d4cb0341b..93d37655d928 100644
--- a/drivers/tty/moxa.c
+++ b/drivers/tty/moxa.c
@@ -428,7 +428,7 @@ static const struct tty_port_operations moxa_port_ops = {
 };
 
 static struct tty_driver *moxaDriver;
-static DEFINE_TIMER(moxaTimer, moxa_poll, 0, 0);
+static DEFINE_TIMER(moxaTimer, moxa_poll);
 
 /*
  * HW init
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
index 20d79a6007d5..aa695fda1084 100644
--- a/drivers/tty/rocket.c
+++ b/drivers/tty/rocket.c
@@ -111,7 +111,7 @@ static struct r_port *rp_table[MAX_RP_PORTS];	       /*  The main repository of
 static unsigned int xmit_flags[NUM_BOARDS];	       /*  Bit significant, indicates port had data to transmit. */
 						       /*  eg.  Bit 0 indicates port 0 has xmit data, ...        */
 static atomic_t rp_num_ports_open;	               /*  Number of serial ports open                           */
-static DEFINE_TIMER(rocket_timer, rp_do_poll, 0, 0);
+static DEFINE_TIMER(rocket_timer, rp_do_poll);
 
 static unsigned long board1;	                       /* ISA addresses, retrieved from rocketport.conf          */
 static unsigned long board2;
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index f4166263bb3a..f974d6340d04 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -250,7 +250,7 @@ static void kd_nosound(unsigned long ignored)
 	input_handler_for_each_handle(&kbd_handler, &zero, kd_sound_helper);
 }
 
-static DEFINE_TIMER(kd_mksound_timer, kd_nosound, 0, 0);
+static DEFINE_TIMER(kd_mksound_timer, kd_nosound);
 
 void kd_mksound(unsigned int hz, unsigned int ticks)
 {
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 2ebaba16f785..602d71630952 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -228,7 +228,7 @@ static int scrollback_delta;
  */
 int (*console_blank_hook)(int);
 
-static DEFINE_TIMER(console_timer, blank_screen_t, 0, 0);
+static DEFINE_TIMER(console_timer, blank_screen_t);
 static int blank_state;
 static int blank_timer_expired;
 enum {
diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c
index 3c1f6ac68ea9..18e896eeca62 100644
--- a/drivers/watchdog/alim7101_wdt.c
+++ b/drivers/watchdog/alim7101_wdt.c
@@ -71,7 +71,7 @@ MODULE_PARM_DESC(use_gpio,
 		"Use the gpio watchdog (required by old cobalt boards).");
 
 static void wdt_timer_ping(unsigned long);
-static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0);
+static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
 static char wdt_expect_close;
diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c
index 9826b59ef734..8a616a57bb90 100644
--- a/drivers/watchdog/machzwd.c
+++ b/drivers/watchdog/machzwd.c
@@ -127,7 +127,7 @@ static int zf_action = GEN_RESET;
 static unsigned long zf_is_open;
 static char zf_expect_close;
 static DEFINE_SPINLOCK(zf_port_lock);
-static DEFINE_TIMER(zf_timer, zf_ping, 0, 0);
+static DEFINE_TIMER(zf_timer, zf_ping);
 static unsigned long next_heartbeat;
 
 
diff --git a/drivers/watchdog/mixcomwd.c b/drivers/watchdog/mixcomwd.c
index be86ea359eee..c9e38096ea91 100644
--- a/drivers/watchdog/mixcomwd.c
+++ b/drivers/watchdog/mixcomwd.c
@@ -105,7 +105,7 @@ static unsigned long mixcomwd_opened; /* long req'd for setbit --RR */
 
 static int watchdog_port;
 static int mixcomwd_timer_alive;
-static DEFINE_TIMER(mixcomwd_timer, mixcomwd_timerfun, 0, 0);
+static DEFINE_TIMER(mixcomwd_timer, mixcomwd_timerfun);
 static char expect_close;
 
 static bool nowayout = WATCHDOG_NOWAYOUT;
diff --git a/drivers/watchdog/sbc60xxwdt.c b/drivers/watchdog/sbc60xxwdt.c
index 2eef58a0cf05..8d589939bc84 100644
--- a/drivers/watchdog/sbc60xxwdt.c
+++ b/drivers/watchdog/sbc60xxwdt.c
@@ -113,7 +113,7 @@ MODULE_PARM_DESC(nowayout,
 				__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
 static void wdt_timer_ping(unsigned long);
-static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0);
+static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
 static char wdt_expect_close;
diff --git a/drivers/watchdog/sc520_wdt.c b/drivers/watchdog/sc520_wdt.c
index 1cfd3f6a13d5..3e9bbaa37bf4 100644
--- a/drivers/watchdog/sc520_wdt.c
+++ b/drivers/watchdog/sc520_wdt.c
@@ -124,7 +124,7 @@ MODULE_PARM_DESC(nowayout,
 static __u16 __iomem *wdtmrctl;
 
 static void wdt_timer_ping(unsigned long);
-static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0);
+static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
 static char wdt_expect_close;
diff --git a/drivers/watchdog/via_wdt.c b/drivers/watchdog/via_wdt.c
index 5f9cbc37520d..ad3c3be13b40 100644
--- a/drivers/watchdog/via_wdt.c
+++ b/drivers/watchdog/via_wdt.c
@@ -68,7 +68,7 @@ static struct resource wdt_res;
 static void __iomem *wdt_mem;
 static unsigned int mmio;
 static void wdt_timer_tick(unsigned long data);
-static DEFINE_TIMER(timer, wdt_timer_tick, 0, 0);
+static DEFINE_TIMER(timer, wdt_timer_tick);
 					/* The timer that pings the watchdog */
 static unsigned long next_heartbeat;	/* the next_heartbeat for the timer */
 
diff --git a/drivers/watchdog/w83877f_wdt.c b/drivers/watchdog/w83877f_wdt.c
index f0483c75ed32..ba6b680af100 100644
--- a/drivers/watchdog/w83877f_wdt.c
+++ b/drivers/watchdog/w83877f_wdt.c
@@ -98,7 +98,7 @@ MODULE_PARM_DESC(nowayout,
 				__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
 static void wdt_timer_ping(unsigned long);
-static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0);
+static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
 static char wdt_expect_close;
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 2c6a9114d332..a8721d718186 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -305,7 +305,7 @@ struct deferred_entry {
 };
 static LIST_HEAD(deferred_list);
 static void gnttab_handle_deferred(unsigned long);
-static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0);
+static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
 
 static void gnttab_handle_deferred(unsigned long unused)
 {
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 2b21d180157c..ec7199e859d2 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -62,7 +62,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content "
 static int pstore_new_entry;
 
 static void pstore_timefunc(unsigned long);
-static DEFINE_TIMER(pstore_timer, pstore_timefunc, 0, 0);
+static DEFINE_TIMER(pstore_timer, pstore_timefunc);
 
 static void pstore_dowork(struct work_struct *);
 static DECLARE_WORK(pstore_work, pstore_dowork);
diff --git a/include/linux/timer.h b/include/linux/timer.h
index a33220311361..91e5a2cc81b5 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -73,9 +73,9 @@ struct timer_list {
 			__FILE__ ":" __stringify(__LINE__))	\
 	}
 
-#define DEFINE_TIMER(_name, _function, _expires, _data)		\
+#define DEFINE_TIMER(_name, _function)				\
 	struct timer_list _name =				\
-		__TIMER_INITIALIZER(_function, _expires, _data, 0)
+		__TIMER_INITIALIZER(_function, 0, 0, 0)
 
 void init_timer_key(struct timer_list *timer, unsigned int flags,
 		    const char *name, struct lock_class_key *key);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 061ba7eed4ed..c805e8691c22 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -20,7 +20,7 @@ static int irqfixup __read_mostly;
 
 #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
 static void poll_spurious_irqs(unsigned long dummy);
-static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
 static int irq_poll_cpu;
 static atomic_t irq_poll_active;
 
diff --git a/lib/random32.c b/lib/random32.c
index fa594b1140e6..6e91b75c113f 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -214,7 +214,7 @@ core_initcall(prandom_init);
 
 static void __prandom_timer(unsigned long dontcare);
 
-static DEFINE_TIMER(seed_timer, __prandom_timer, 0, 0);
+static DEFINE_TIMER(seed_timer, __prandom_timer);
 
 static void __prandom_timer(unsigned long dontcare)
 {
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 5677147209e8..63138c8c2269 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -121,7 +121,7 @@ static struct notifier_block mpoa_notifier = {
 
 struct mpoa_client *mpcs = NULL; /* FIXME */
 static struct atm_mpoa_qos *qos_head = NULL;
-static DEFINE_TIMER(mpc_timer, NULL, 0, 0);
+static DEFINE_TIMER(mpc_timer, NULL);
 
 
 static struct mpoa_client *find_mpc_by_itfnum(int itf)
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 0bd3afd01dd2..6538632fbd03 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -131,7 +131,7 @@ static struct dn_rt_hash_bucket *dn_rt_hash_table;
 static unsigned int dn_rt_hash_mask;
 
 static struct timer_list dn_route_timer;
-static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush, 0, 0);
+static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush);
 int decnet_dst_gc_interval = 2;
 
 static struct dst_ops dn_dst_ops = {
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 8081bafe441b..b39d0908be2e 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -47,7 +47,7 @@ static atomic_t fl_size = ATOMIC_INIT(0);
 static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];
 
 static void ip6_fl_gc(unsigned long dummy);
-static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc, 0, 0);
+static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc);
 
 /* FL hash table lock: it protects only of GC */
 
diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c
index 94d4e922af53..989ae647825e 100644
--- a/net/netrom/nr_loopback.c
+++ b/net/netrom/nr_loopback.c
@@ -18,7 +18,7 @@
 static void nr_loopback_timer(unsigned long);
 
 static struct sk_buff_head loopback_queue;
-static DEFINE_TIMER(loopback_timer, nr_loopback_timer, 0, 0);
+static DEFINE_TIMER(loopback_timer, nr_loopback_timer);
 
 void __init nr_loopback_init(void)
 {
diff --git a/security/keys/gc.c b/security/keys/gc.c
index 87cb260e4890..8673f7f58ead 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -30,7 +30,7 @@ DECLARE_WORK(key_gc_work, key_garbage_collector);
  * Reaper for links from keyrings to dead keys.
  */
 static void key_gc_timer_func(unsigned long);
-static DEFINE_TIMER(key_gc_timer, key_gc_timer_func, 0, 0);
+static DEFINE_TIMER(key_gc_timer, key_gc_timer_func);
 
 static time_t key_gc_next_run = LONG_MAX;
 static struct key_type *key_gc_dead_keytype;
diff --git a/sound/oss/midibuf.c b/sound/oss/midibuf.c
index 701c7625c971..1277df815d5b 100644
--- a/sound/oss/midibuf.c
+++ b/sound/oss/midibuf.c
@@ -52,7 +52,7 @@ static struct midi_parms parms[MAX_MIDI_DEV];
 static void midi_poll(unsigned long dummy);
 
 
-static DEFINE_TIMER(poll_timer, midi_poll, 0, 0);
+static DEFINE_TIMER(poll_timer, midi_poll);
 
 static volatile int open_devs;
 static DEFINE_SPINLOCK(lock);
diff --git a/sound/oss/soundcard.c b/sound/oss/soundcard.c
index b70c7c8f9c5d..4391062e5cfd 100644
--- a/sound/oss/soundcard.c
+++ b/sound/oss/soundcard.c
@@ -662,7 +662,7 @@ static void do_sequencer_timer(unsigned long dummy)
 }
 
 
-static DEFINE_TIMER(seq_timer, do_sequencer_timer, 0, 0);
+static DEFINE_TIMER(seq_timer, do_sequencer_timer);
 
 void request_sound_timer(int count)
 {
diff --git a/sound/oss/sys_timer.c b/sound/oss/sys_timer.c
index d17019d25b99..8a4b5625dba6 100644
--- a/sound/oss/sys_timer.c
+++ b/sound/oss/sys_timer.c
@@ -28,7 +28,7 @@ static unsigned long prev_event_time;
 
 static void     poll_def_tmr(unsigned long dummy);
 static DEFINE_SPINLOCK(lock);
-static DEFINE_TIMER(def_tmr, poll_def_tmr, 0, 0);
+static DEFINE_TIMER(def_tmr, poll_def_tmr);
 
 static unsigned long
 tmr2ticks(int tmr_value)
diff --git a/sound/oss/uart6850.c b/sound/oss/uart6850.c
index eda32d7eddbd..a9d3f7568525 100644
--- a/sound/oss/uart6850.c
+++ b/sound/oss/uart6850.c
@@ -78,7 +78,7 @@ static void (*midi_input_intr) (int dev, unsigned char data);
 static void poll_uart6850(unsigned long dummy);
 
 
-static DEFINE_TIMER(uart6850_timer, poll_uart6850, 0, 0);
+static DEFINE_TIMER(uart6850_timer, poll_uart6850);
 
 static void uart6850_input_loop(void)
 {
-- 
cgit v1.3-14-g43fede


From fe5c3b69b540e3387223a696f327c1bb8880d1ac Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 4 Oct 2017 16:27:06 -0700
Subject: kthread: Convert callback to use from_timer()

In preparation for unconditionally passing the struct timer_list pointer
to all timer callbacks, switch kthread to use from_timer() and pass the
timer pointer explicitly.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mips@linux-mips.org
Cc: Len Brown <len.brown@intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Kalle Valo <kvalo@qca.qualcomm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: linux1394-devel@lists.sourceforge.net
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: linux-s390@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
Cc: Wim Van Sebroeck <wim@iguana.be>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Harish Patil <harish.patil@cavium.com>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Manish Chopra <manish.chopra@cavium.com>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-pm@vger.kernel.org
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mark Gross <mark.gross@intel.com>
Cc: linux-watchdog@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Michael Reed <mdr@sgi.com>
Cc: netdev@vger.kernel.org
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Link: https://lkml.kernel.org/r/1507159627-127660-13-git-send-email-keescook@chromium.org
---
 include/linux/kthread.h | 10 +++++-----
 kernel/kthread.c        | 10 ++++------
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 0d622b350d3f..35cbe3b0ce5b 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -75,7 +75,7 @@ extern int tsk_fork_get_node(struct task_struct *tsk);
  */
 struct kthread_work;
 typedef void (*kthread_work_func_t)(struct kthread_work *work);
-void kthread_delayed_work_timer_fn(unsigned long __data);
+void kthread_delayed_work_timer_fn(struct timer_list *t);
 
 enum {
 	KTW_FREEZABLE		= 1 << 0,	/* freeze during suspend */
@@ -116,8 +116,8 @@ struct kthread_delayed_work {
 
 #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) {				\
 	.work = KTHREAD_WORK_INIT((dwork).work, (fn)),			\
-	.timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn,	\
-				     (unsigned long)&(dwork),		\
+	.timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\
+				     (TIMER_DATA_TYPE)&(dwork.timer),	\
 				     TIMER_IRQSAFE),			\
 	}
 
@@ -164,8 +164,8 @@ extern void __kthread_init_worker(struct kthread_worker *worker,
 	do {								\
 		kthread_init_work(&(dwork)->work, (fn));		\
 		__setup_timer(&(dwork)->timer,				\
-			      kthread_delayed_work_timer_fn,		\
-			      (unsigned long)(dwork),			\
+			      (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\
+			      (TIMER_DATA_TYPE)&(dwork)->timer,		\
 			      TIMER_IRQSAFE);				\
 	} while (0)
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1c19edf82427..ba3992c8c375 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -798,15 +798,14 @@ EXPORT_SYMBOL_GPL(kthread_queue_work);
 /**
  * kthread_delayed_work_timer_fn - callback that queues the associated kthread
  *	delayed work when the timer expires.
- * @__data: pointer to the data associated with the timer
+ * @t: pointer to the expired timer
  *
  * The format of the function is defined by struct timer_list.
  * It should have been called from irqsafe timer with irq already off.
  */
-void kthread_delayed_work_timer_fn(unsigned long __data)
+void kthread_delayed_work_timer_fn(struct timer_list *t)
 {
-	struct kthread_delayed_work *dwork =
-		(struct kthread_delayed_work *)__data;
+	struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
 	struct kthread_work *work = &dwork->work;
 	struct kthread_worker *worker = work->worker;
 
@@ -837,8 +836,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
 	struct timer_list *timer = &dwork->timer;
 	struct kthread_work *work = &dwork->work;
 
-	WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn ||
-		     timer->data != (unsigned long)dwork);
+	WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
 
 	/*
 	 * If @delay is 0, queue @dwork->work immediately.  This is for
-- 
cgit v1.3-14-g43fede


From 8c20feb60604d91a29cd7fef8ac758bd92d9fd2c Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 4 Oct 2017 16:27:07 -0700
Subject: workqueue: Convert callback to use from_timer()

In preparation for unconditionally passing the struct timer_list pointer
to all timer callbacks, switch workqueue to use from_timer() and pass the
timer pointer explicitly.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mips@linux-mips.org
Cc: Petr Mladek <pmladek@suse.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Sebastian Reichel <sre@kernel.org>
Cc: Kalle Valo <kvalo@qca.qualcomm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: linux1394-devel@lists.sourceforge.net
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: linux-s390@vger.kernel.org
Cc: linux-wireless@vger.kernel.org
Cc: "James E.J. Bottomley" <jejb@linux.vnet.ibm.com>
Cc: Wim Van Sebroeck <wim@iguana.be>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Ursula Braun <ubraun@linux.vnet.ibm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Harish Patil <harish.patil@cavium.com>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Manish Chopra <manish.chopra@cavium.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: linux-pm@vger.kernel.org
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Julian Wiedmann <jwi@linux.vnet.ibm.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mark Gross <mark.gross@intel.com>
Cc: linux-watchdog@vger.kernel.org
Cc: linux-scsi@vger.kernel.org
Cc: "Martin K. Petersen" <martin.petersen@oracle.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Stefan Richter <stefanr@s5r6.in-berlin.de>
Cc: Michael Reed <mdr@sgi.com>
Cc: netdev@vger.kernel.org
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
Link: https://lkml.kernel.org/r/1507159627-127660-14-git-send-email-keescook@chromium.org
---
 include/linux/workqueue.h | 15 ++++++++-------
 kernel/workqueue.c        |  7 +++----
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index f4960260feaf..f3c47a05fd06 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -17,7 +17,7 @@ struct workqueue_struct;
 
 struct work_struct;
 typedef void (*work_func_t)(struct work_struct *work);
-void delayed_work_timer_fn(unsigned long __data);
+void delayed_work_timer_fn(struct timer_list *t);
 
 /*
  * The first word is the work queue pointer and the flags rolled into
@@ -175,8 +175,8 @@ struct execute_work {
 
 #define __DELAYED_WORK_INITIALIZER(n, f, tflags) {			\
 	.work = __WORK_INITIALIZER((n).work, (f)),			\
-	.timer = __TIMER_INITIALIZER(delayed_work_timer_fn,		\
-				     (unsigned long)&(n),		\
+	.timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)delayed_work_timer_fn,\
+				     (TIMER_DATA_TYPE)&(n.timer),	\
 				     (tflags) | TIMER_IRQSAFE),		\
 	}
 
@@ -241,8 +241,9 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 #define __INIT_DELAYED_WORK(_work, _func, _tflags)			\
 	do {								\
 		INIT_WORK(&(_work)->work, (_func));			\
-		__setup_timer(&(_work)->timer, delayed_work_timer_fn,	\
-			      (unsigned long)(_work),			\
+		__setup_timer(&(_work)->timer,				\
+			      (TIMER_FUNC_TYPE)delayed_work_timer_fn,	\
+			      (TIMER_DATA_TYPE)&(_work)->timer,		\
 			      (_tflags) | TIMER_IRQSAFE);		\
 	} while (0)
 
@@ -250,8 +251,8 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 	do {								\
 		INIT_WORK_ONSTACK(&(_work)->work, (_func));		\
 		__setup_timer_on_stack(&(_work)->timer,			\
-				       delayed_work_timer_fn,		\
-				       (unsigned long)(_work),		\
+				       (TIMER_FUNC_TYPE)delayed_work_timer_fn,\
+				       (TIMER_DATA_TYPE)&(_work)->timer,\
 				       (_tflags) | TIMER_IRQSAFE);	\
 	} while (0)
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a5361fc6215d..c77fdf6bf24f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1492,9 +1492,9 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL(queue_work_on);
 
-void delayed_work_timer_fn(unsigned long __data)
+void delayed_work_timer_fn(struct timer_list *t)
 {
-	struct delayed_work *dwork = (struct delayed_work *)__data;
+	struct delayed_work *dwork = from_timer(dwork, t, timer);
 
 	/* should have been called from irqsafe timer with irq already off */
 	__queue_work(dwork->cpu, dwork->wq, &dwork->work);
@@ -1508,8 +1508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	struct work_struct *work = &dwork->work;
 
 	WARN_ON_ONCE(!wq);
-	WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
-		     timer->data != (unsigned long)dwork);
+	WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
 	WARN_ON_ONCE(timer_pending(timer));
 	WARN_ON_ONCE(!list_empty(&work->entry));
 
-- 
cgit v1.3-14-g43fede


From 3e234289f86b12985ef8909cd34525fcb66c4efb Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 3 Mar 2017 18:00:22 -0500
Subject: ftrace: Allow module init functions to be traced

Allow for module init sections to be traced as well as core kernel init
sections. Now that filtering modules functions can be stored, for when they
are loaded, it makes sense to be able to trace them.

Cc: Jessica Yu <jeyu@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/init.h  | 4 +---
 kernel/module.c       | 2 ++
 kernel/trace/ftrace.c | 6 ++++--
 3 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init.h b/include/linux/init.h
index 94769d687cf0..a779c1816437 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -39,7 +39,7 @@
 
 /* These are for everybody (although not all archs will actually
    discard it in modules) */
-#define __init		__section(.init.text) __cold __inittrace __latent_entropy
+#define __init		__section(.init.text) __cold  __latent_entropy
 #define __initdata	__section(.init.data)
 #define __initconst	__section(.init.rodata)
 #define __exitdata	__section(.exit.data)
@@ -68,10 +68,8 @@
 
 #ifdef MODULE
 #define __exitused
-#define __inittrace notrace
 #else
 #define __exitused  __used
-#define __inittrace
 #endif
 
 #define __exit          __section(.exit.text) __exitused __cold notrace
diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..58bca427ac3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3473,6 +3473,8 @@ static noinline int do_init_module(struct module *mod)
 	if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
 		async_synchronize_full();
 
+	ftrace_free_mem(mod->init_layout.base, mod->init_layout.base +
+			mod->init_layout.size);
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 84cb5928665a..d7297e866e4a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5752,7 +5752,8 @@ void ftrace_release_mod(struct module *mod)
 	last_pg = &ftrace_pages_start;
 	for (pg = ftrace_pages_start; pg; pg = *last_pg) {
 		rec = &pg->records[0];
-		if (within_module_core(rec->ip, mod)) {
+		if (within_module_core(rec->ip, mod) ||
+		    within_module_init(rec->ip, mod)) {
 			/*
 			 * As core pages are first, the first
 			 * page should never be a module page.
@@ -5821,7 +5822,8 @@ void ftrace_module_enable(struct module *mod)
 		 * not part of this module, then skip this pg,
 		 * which the "break" will do.
 		 */
-		if (!within_module_core(rec->ip, mod))
+		if (!within_module_core(rec->ip, mod) &&
+		    !within_module_init(rec->ip, mod))
 			break;
 
 		cnt = 0;
-- 
cgit v1.3-14-g43fede


From aba4b5c22cbac296f4081a0476d0c55828f135b4 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Fri, 1 Sep 2017 08:35:38 -0400
Subject: ftrace: Save module init functions kallsyms symbols for tracing

If function tracing is active when the module init functions are freed, then
store them to be referenced by kallsyms. As module init functions can now be
traced on module load, they were useless:

 ># echo ':mod:snd_seq' > set_ftrace_filter
 ># echo function > current_tracer
 ># modprobe snd_seq
 ># cat trace
 # tracer: function
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
         modprobe-2786  [000] ....  3189.037874: 0xffffffffa0860000 <-do_one_initcall
         modprobe-2786  [000] ....  3189.037876: 0xffffffffa086004d <-0xffffffffa086000f
         modprobe-2786  [000] ....  3189.037876: 0xffffffffa086010d <-0xffffffffa0860018
         modprobe-2786  [000] ....  3189.037877: 0xffffffffa086011a <-0xffffffffa0860021
         modprobe-2786  [000] ....  3189.037877: 0xffffffffa0860080 <-0xffffffffa086002a
         modprobe-2786  [000] ....  3189.039523: 0xffffffffa0860400 <-0xffffffffa0860033
         modprobe-2786  [000] ....  3189.039523: 0xffffffffa086038a <-0xffffffffa086041c
         modprobe-2786  [000] ....  3189.039591: 0xffffffffa086038a <-0xffffffffa0860436
         modprobe-2786  [000] ....  3189.039657: 0xffffffffa086038a <-0xffffffffa0860450
         modprobe-2786  [000] ....  3189.039719: 0xffffffffa0860127 <-0xffffffffa086003c
         modprobe-2786  [000] ....  3189.039742: snd_seq_create_kernel_client <-0xffffffffa08601f6

When the output is shown, the kallsyms for the module init functions have
already been freed, and the output of the trace can not convert them to
their function names.

Now this looks like this:

 # tracer: function
 #
 #                              _-----=> irqs-off
 #                             / _----=> need-resched
 #                            | / _---=> hardirq/softirq
 #                            || / _--=> preempt-depth
 #                            ||| /     delay
 #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
 #              | |       |   ||||       |         |
         modprobe-2463  [002] ....   174.243237: alsa_seq_init <-do_one_initcall
         modprobe-2463  [002] ....   174.243239: client_init_data <-alsa_seq_init
         modprobe-2463  [002] ....   174.243240: snd_sequencer_memory_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.243240: snd_seq_queues_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.243240: snd_sequencer_device_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.244860: snd_seq_info_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.244861: create_info_entry <-snd_seq_info_init
         modprobe-2463  [002] ....   174.244936: create_info_entry <-snd_seq_info_init
         modprobe-2463  [002] ....   174.245003: create_info_entry <-snd_seq_info_init
         modprobe-2463  [002] ....   174.245072: snd_seq_system_client_init <-alsa_seq_init
         modprobe-2463  [002] ....   174.245094: snd_seq_create_kernel_client <-snd_seq_system_client_init

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  20 ++++++-
 kernel/kallsyms.c      |   5 ++
 kernel/module.c        |   2 +-
 kernel/trace/ftrace.c  | 146 ++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 168 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 47fc404ad233..202b40784c4e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -51,6 +51,21 @@ static inline void early_trace_init(void) { }
 struct module;
 struct ftrace_hash;
 
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_MODULES) && \
+	defined(CONFIG_DYNAMIC_FTRACE)
+const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym);
+#else
+static inline const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	return NULL;
+}
+#endif
+
+
 #ifdef CONFIG_FUNCTION_TRACER
 
 extern int ftrace_enabled;
@@ -151,10 +166,10 @@ struct ftrace_ops_hash {
 };
 
 void ftrace_free_init_mem(void);
-void ftrace_free_mem(void *start, void *end);
+void ftrace_free_mem(struct module *mod, void *start, void *end);
 #else
 static inline void ftrace_free_init_mem(void) { }
-static inline void ftrace_free_mem(void *start, void *end) { }
+static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { }
 #endif
 
 /*
@@ -272,6 +287,7 @@ static inline int ftrace_nr_registered_ops(void)
 static inline void clear_ftrace_function(void) { }
 static inline void ftrace_kill(void) { }
 static inline void ftrace_free_init_mem(void) { }
+static inline void ftrace_free_mem(struct module *mod, void *start, void *end) { }
 #endif /* CONFIG_FUNCTION_TRACER */
 
 #ifdef CONFIG_STACK_TRACER
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 127e7cfafa55..976ecb9275d9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/filter.h>
+#include <linux/ftrace.h>
 #include <linux/compiler.h>
 
 #include <asm/sections.h>
@@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr,
 	if (!ret)
 		ret = bpf_address_lookup(addr, symbolsize,
 					 offset, modname, namebuf);
+
+	if (!ret)
+		ret = ftrace_mod_address_lookup(addr, symbolsize,
+						offset, modname, namebuf);
 	return ret;
 }
 
diff --git a/kernel/module.c b/kernel/module.c
index 58bca427ac3f..279a469dc375 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3473,7 +3473,7 @@ static noinline int do_init_module(struct module *mod)
 	if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
 		async_synchronize_full();
 
-	ftrace_free_mem(mod->init_layout.base, mod->init_layout.base +
+	ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
 			mod->init_layout.size);
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d7297e866e4a..86dbbfb353db 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5675,6 +5675,21 @@ static int ftrace_process_locs(struct module *mod,
 	return ret;
 }
 
+struct ftrace_mod_func {
+	struct list_head	list;
+	char			*name;
+	unsigned long		ip;
+	unsigned int		size;
+};
+
+struct ftrace_mod_map {
+	struct list_head	list;
+	struct module		*mod;
+	unsigned long		start_addr;
+	unsigned long		end_addr;
+	struct list_head	funcs;
+};
+
 #ifdef CONFIG_MODULES
 
 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -5868,9 +5883,123 @@ void ftrace_module_init(struct module *mod)
 	ftrace_process_locs(mod, mod->ftrace_callsites,
 			    mod->ftrace_callsites + mod->num_ftrace_callsites);
 }
+
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+				struct dyn_ftrace *rec)
+{
+	struct ftrace_mod_func *mod_func;
+	unsigned long symsize;
+	unsigned long offset;
+	char str[KSYM_SYMBOL_LEN];
+	char *modname;
+	const char *ret;
+
+	ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str);
+	if (!ret)
+		return;
+
+	mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL);
+	if (!mod_func)
+		return;
+
+	mod_func->name = kstrdup(str, GFP_KERNEL);
+	if (!mod_func->name) {
+		kfree(mod_func);
+		return;
+	}
+
+	mod_func->ip = rec->ip - offset;
+	mod_func->size = symsize;
+
+	list_add_rcu(&mod_func->list, &mod_map->funcs);
+}
+
+static LIST_HEAD(ftrace_mod_maps);
+
+static struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+			unsigned long start, unsigned long end)
+{
+	struct ftrace_mod_map *mod_map;
+
+	mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL);
+	if (!mod_map)
+		return NULL;
+
+	mod_map->mod = mod;
+	mod_map->start_addr = start;
+	mod_map->end_addr = end;
+
+	INIT_LIST_HEAD_RCU(&mod_map->funcs);
+
+	list_add_rcu(&mod_map->list, &ftrace_mod_maps);
+
+	return mod_map;
+}
+
+static const char *
+ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
+			   unsigned long addr, unsigned long *size,
+			   unsigned long *off, char *sym)
+{
+	struct ftrace_mod_func *found_func =  NULL;
+	struct ftrace_mod_func *mod_func;
+
+	list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+		if (addr >= mod_func->ip &&
+		    addr < mod_func->ip + mod_func->size) {
+			found_func = mod_func;
+			break;
+		}
+	}
+
+	if (found_func) {
+		if (size)
+			*size = found_func->size;
+		if (off)
+			*off = addr - found_func->ip;
+		if (sym)
+			strlcpy(sym, found_func->name, KSYM_NAME_LEN);
+
+		return found_func->name;
+	}
+
+	return NULL;
+}
+
+const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	struct ftrace_mod_map *mod_map;
+	const char *ret = NULL;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+		ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
+		if (ret) {
+			if (modname)
+				*modname = mod_map->mod->name;
+			break;
+		}
+	}
+	preempt_enable();
+
+	return ret;
+}
+
+#else
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+				struct dyn_ftrace *rec) { }
+static inline struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+			unsigned long start, unsigned long end)
+{
+	return NULL;
+}
 #endif /* CONFIG_MODULES */
 
-void ftrace_free_mem(void *start_ptr, void *end_ptr)
+void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 {
 	unsigned long start = (unsigned long)(start_ptr);
 	unsigned long end = (unsigned long)(end_ptr);
@@ -5878,6 +6007,7 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr)
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	struct dyn_ftrace key;
+	struct ftrace_mod_map *mod_map = NULL;
 	int order;
 
 	key.ip = start;
@@ -5885,6 +6015,14 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr)
 
 	mutex_lock(&ftrace_lock);
 
+	/*
+	 * If we are freeing module init memory, then check if
+	 * any tracer is active. If so, we need to save a mapping of
+	 * the module functions being freed with the address.
+	 */
+	if (mod && ftrace_ops_list != &ftrace_list_end)
+		mod_map = allocate_ftrace_mod_map(mod, start, end);
+
 	for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
 		if (end < pg->records[0].ip ||
 		    start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
@@ -5895,6 +6033,10 @@ void ftrace_free_mem(void *start_ptr, void *end_ptr)
 			      ftrace_cmp_recs);
 		if (!rec)
 			continue;
+
+		if (mod_map)
+			save_ftrace_mod_rec(mod_map, rec);
+
 		pg->index--;
 		ftrace_update_tot_cnt--;
 		if (!pg->index) {
@@ -5920,7 +6062,7 @@ void __init ftrace_free_init_mem(void)
 	void *start = (void *)(&__init_begin);
 	void *end = (void *)(&__init_end);
 
-	ftrace_free_mem(start, end);
+	ftrace_free_mem(NULL, start, end);
 }
 
 void __init ftrace_init(void)
-- 
cgit v1.3-14-g43fede


From 6aa69784b43eb5f69120339938c50a97a433049f Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Tue, 5 Sep 2017 19:20:16 -0400
Subject: ftrace: Add freeing algorithm to free ftrace_mod_maps

The ftrace_mod_map is a descriptor to save module init function names in
case they were traced, and the trace output needs to reference the function
name from the function address. But after the function is unloaded, it
the maps should be freed, as the rest of the function names are as well.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 86dbbfb353db..a5824408bed9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5683,6 +5683,7 @@ struct ftrace_mod_func {
 };
 
 struct ftrace_mod_map {
+	struct rcu_head		rcu;
 	struct list_head	list;
 	struct module		*mod;
 	unsigned long		start_addr;
@@ -5694,6 +5695,8 @@ struct ftrace_mod_map {
 
 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
 
+static LIST_HEAD(ftrace_mod_maps);
+
 static int referenced_filters(struct dyn_ftrace *rec)
 {
 	struct ftrace_ops *ops;
@@ -5747,8 +5750,26 @@ static void clear_mod_from_hashes(struct ftrace_page *pg)
 	mutex_unlock(&trace_types_lock);
 }
 
+static void ftrace_free_mod_map(struct rcu_head *rcu)
+{
+	struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu);
+	struct ftrace_mod_func *mod_func;
+	struct ftrace_mod_func *n;
+
+	/* All the contents of mod_map are now not visible to readers */
+	list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) {
+		kfree(mod_func->name);
+		list_del(&mod_func->list);
+		kfree(mod_func);
+	}
+
+	kfree(mod_map);
+}
+
 void ftrace_release_mod(struct module *mod)
 {
+	struct ftrace_mod_map *mod_map;
+	struct ftrace_mod_map *n;
 	struct dyn_ftrace *rec;
 	struct ftrace_page **last_pg;
 	struct ftrace_page *tmp_page = NULL;
@@ -5760,6 +5781,14 @@ void ftrace_release_mod(struct module *mod)
 	if (ftrace_disabled)
 		goto out_unlock;
 
+	list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
+		if (mod_map->mod == mod) {
+			list_del_rcu(&mod_map->list);
+			call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map);
+			break;
+		}
+	}
+
 	/*
 	 * Each module has its own ftrace_pages, remove
 	 * them from the list.
@@ -5914,8 +5943,6 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
 	list_add_rcu(&mod_func->list, &mod_map->funcs);
 }
 
-static LIST_HEAD(ftrace_mod_maps);
-
 static struct ftrace_mod_map *
 allocate_ftrace_mod_map(struct module *mod,
 			unsigned long start, unsigned long end)
@@ -5974,6 +6001,7 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 	struct ftrace_mod_map *mod_map;
 	const char *ret = NULL;
 
+	/* mod_map is freed via call_rcu_sched() */
 	preempt_disable();
 	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
 		ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
-- 
cgit v1.3-14-g43fede


From 6171a0310a06a7a0cb83713fa7068bdd4192de19 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 6 Sep 2017 08:40:41 -0400
Subject: ftrace/kallsyms: Have /proc/kallsyms show saved mod init functions

If a module is loaded while tracing is enabled, then there's a possibility
that the module init functions were traced. These functions have their name
and address stored by ftrace such that it can translate the function address
that is written into the buffer into a human readable function name.

As userspace tools may be doing the same, they need a way to map function
names to their address as well. This is done through reading /proc/kallsyms.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  9 +++++++++
 kernel/kallsyms.c      | 38 ++++++++++++++++++++++++++++++++------
 kernel/trace/ftrace.c  | 40 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 202b40784c4e..346f8294e40a 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -56,6 +56,9 @@ struct ftrace_hash;
 const char *
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 		   unsigned long *off, char **modname, char *sym);
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name,
+			   char *module_name, int *exported);
 #else
 static inline const char *
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
@@ -63,6 +66,12 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 {
 	return NULL;
 }
+static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+					 char *type, char *name,
+					 char *module_name, int *exported)
+{
+	return -1;
+}
 #endif
 
 
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 976ecb9275d9..1966fe1c2b57 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -479,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
 struct kallsym_iter {
 	loff_t pos;
 	loff_t pos_mod_end;
+	loff_t pos_ftrace_mod_end;
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols. */
 	char type;
@@ -501,11 +502,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	return 1;
 }
 
+static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
+{
+	int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+					 &iter->value, &iter->type,
+					 iter->name, iter->module_name,
+					 &iter->exported);
+	if (ret < 0) {
+		iter->pos_ftrace_mod_end = iter->pos;
+		return 0;
+	}
+
+	return 1;
+}
+
 static int get_ksymbol_bpf(struct kallsym_iter *iter)
 {
 	iter->module_name[0] = '\0';
 	iter->exported = 0;
-	return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+	return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
 			       &iter->value, &iter->type,
 			       iter->name) < 0 ? 0 : 1;
 }
@@ -530,20 +545,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
 	iter->name[0] = '\0';
 	iter->nameoff = get_symbol_offset(new_pos);
 	iter->pos = new_pos;
-	if (new_pos == 0)
+	if (new_pos == 0) {
 		iter->pos_mod_end = 0;
+		iter->pos_ftrace_mod_end = 0;
+	}
 }
 
 static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
 {
 	iter->pos = pos;
 
-	if (iter->pos_mod_end > 0 &&
-	    iter->pos_mod_end < iter->pos)
+	if (iter->pos_ftrace_mod_end > 0 &&
+	    iter->pos_ftrace_mod_end < iter->pos)
 		return get_ksymbol_bpf(iter);
 
-	if (!get_ksymbol_mod(iter))
-		return get_ksymbol_bpf(iter);
+	if (iter->pos_mod_end > 0 &&
+	    iter->pos_mod_end < iter->pos) {
+		if (!get_ksymbol_ftrace_mod(iter))
+			return get_ksymbol_bpf(iter);
+		return 1;
+	}
+
+	if (!get_ksymbol_mod(iter)) {
+		if (!get_ksymbol_ftrace_mod(iter))
+			return get_ksymbol_bpf(iter);
+	}
 
 	return 1;
 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a5824408bed9..9e99bd55732e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5689,6 +5689,7 @@ struct ftrace_mod_map {
 	unsigned long		start_addr;
 	unsigned long		end_addr;
 	struct list_head	funcs;
+	unsigned int		num_funcs;
 };
 
 #ifdef CONFIG_MODULES
@@ -5940,6 +5941,8 @@ static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
 	mod_func->ip = rec->ip - offset;
 	mod_func->size = symsize;
 
+	mod_map->num_funcs++;
+
 	list_add_rcu(&mod_func->list, &mod_map->funcs);
 }
 
@@ -5956,6 +5959,7 @@ allocate_ftrace_mod_map(struct module *mod,
 	mod_map->mod = mod;
 	mod_map->start_addr = start;
 	mod_map->end_addr = end;
+	mod_map->num_funcs = 0;
 
 	INIT_LIST_HEAD_RCU(&mod_map->funcs);
 
@@ -6016,6 +6020,42 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 	return ret;
 }
 
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name,
+			   char *module_name, int *exported)
+{
+	struct ftrace_mod_map *mod_map;
+	struct ftrace_mod_func *mod_func;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+
+		if (symnum >= mod_map->num_funcs) {
+			symnum -= mod_map->num_funcs;
+			continue;
+		}
+
+		list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+			if (symnum > 1) {
+				symnum--;
+				continue;
+			}
+
+			*value = mod_func->ip;
+			*type = 'T';
+			strlcpy(name, mod_func->name, KSYM_NAME_LEN);
+			strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
+			*exported = 1;
+			preempt_enable();
+			return 0;
+		}
+		WARN_ON(1);
+		break;
+	}
+	preempt_enable();
+	return -ERANGE;
+}
+
 #else
 static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
 				struct dyn_ftrace *rec) { }
-- 
cgit v1.3-14-g43fede


From aaecaa0b5f31794f1711247da4b5883a6ff98163 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Thu, 5 Oct 2017 17:54:31 -0700
Subject: tracing: Prepare to add preempt and irq trace events

In preparation of adding irqsoff and preemptsoff enable and disable trace
events, move required functions and code to make it easier to add these events
in a later patch. This patch is just code movement and no functional change.
Link: http://lkml.kernel.org/r/20171006005432.14244-2-joelaf@google.com

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: kernel-team@android.com
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_irqsoff.c | 100 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 74 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7758bc0617cb..0e3033c00474 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,7 @@
 
 #include "trace.h"
 
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
 static struct trace_array		*irqsoff_trace __read_mostly;
 static int				tracer_enabled __read_mostly;
 
@@ -462,64 +463,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
 
 #else /* !CONFIG_PROVE_LOCKING */
 
-/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
 /*
  * We are only interested in hardirq on/off events:
  */
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
-EXPORT_SYMBOL(trace_hardirqs_on);
 
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
-EXPORT_SYMBOL(trace_hardirqs_off);
 
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, caller_addr);
 }
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, caller_addr);
 }
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
 #endif /* CONFIG_PROVE_LOCKING */
 #endif /*  CONFIG_IRQSOFF_TRACER */
 
 #ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
 {
 	if (preempt_trace() && !irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
 {
 	if (preempt_trace() && !irq_trace())
 		start_critical_timing(a0, a1);
@@ -781,3 +762,70 @@ __init static int init_irqsoff_tracer(void)
 	return 0;
 }
 core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+void trace_hardirqs_on(void)
+{
+	tracer_hardirqs_on();
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+	tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	tracer_hardirqs_on_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	tracer_preempt_off(a0, a1);
+}
+#endif
-- 
cgit v1.3-14-g43fede


From 1bd845bcb41d5b7f83745e0cb99273eb376f2ec5 Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Fri, 8 Sep 2017 20:57:09 +0200
Subject: padata: set cpu_index of unused CPUs to -1

The parallel queue per-cpu data structure gets initialized only for CPUs
in the 'pcpu' CPU mask set. This is not sufficient as the reorder timer
may run on a different CPU and might wrongly decide it's the target CPU
for the next reorder item as per-cpu memory gets memset(0) and we might
be waiting for the first CPU in cpumask.pcpu, i.e. cpu_index 0.

Make the '__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index'
compare in padata_get_next() fail in this case by initializing the
cpu_index member of all per-cpu parallel queues. Use -1 for unused ones.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 kernel/padata.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/padata.c b/kernel/padata.c
index 868f947166d7..1b9b4bac4a9b 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -384,8 +384,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
 	struct padata_parallel_queue *pqueue;
 
 	cpu_index = 0;
-	for_each_cpu(cpu, pd->cpumask.pcpu) {
+	for_each_possible_cpu(cpu) {
 		pqueue = per_cpu_ptr(pd->pqueue, cpu);
+
+		if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+			pqueue->cpu_index = -1;
+			continue;
+		}
+
 		pqueue->pd = pd;
 		pqueue->cpu_index = cpu_index;
 		cpu_index++;
-- 
cgit v1.3-14-g43fede


From cf5868c8a22dc2854b96e9569064bb92365549ca Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Fri, 8 Sep 2017 20:57:10 +0200
Subject: padata: ensure the reorder timer callback runs on the correct CPU

The reorder timer function runs on the CPU where the timer interrupt was
handled which is not necessarily one of the CPUs of the 'pcpu' CPU mask
set.

Ensure the padata_reorder() callback runs on the correct CPU, which is
one in the 'pcpu' CPU mask set and, preferrably, the next expected one.
Do so by comparing the current CPU with the expected target CPU. If they
match, call padata_reorder() right away. If they differ, schedule a work
item on the target CPU that does the padata_reorder() call for us.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |  2 ++
 kernel/padata.c        | 43 ++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 2f9c1f93b1ce..5c0175bbc179 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -85,6 +85,7 @@ struct padata_serial_queue {
  * @swork: work struct for serialization.
  * @pd: Backpointer to the internal control structure.
  * @work: work struct for parallelization.
+ * @reorder_work: work struct for reordering.
  * @num_obj: Number of objects that are processed by this cpu.
  * @cpu_index: Index of the cpu.
  */
@@ -93,6 +94,7 @@ struct padata_parallel_queue {
        struct padata_list    reorder;
        struct parallel_data *pd;
        struct work_struct    work;
+       struct work_struct    reorder_work;
        atomic_t              num_obj;
        int                   cpu_index;
 };
diff --git a/kernel/padata.c b/kernel/padata.c
index 1b9b4bac4a9b..b4066147bce4 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -275,11 +275,51 @@ static void padata_reorder(struct parallel_data *pd)
 	return;
 }
 
+static void invoke_padata_reorder(struct work_struct *work)
+{
+	struct padata_parallel_queue *pqueue;
+	struct parallel_data *pd;
+
+	local_bh_disable();
+	pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
+	pd = pqueue->pd;
+	padata_reorder(pd);
+	local_bh_enable();
+}
+
 static void padata_reorder_timer(unsigned long arg)
 {
 	struct parallel_data *pd = (struct parallel_data *)arg;
+	unsigned int weight;
+	int target_cpu, cpu;
 
-	padata_reorder(pd);
+	cpu = get_cpu();
+
+	/* We don't lock pd here to not interfere with parallel processing
+	 * padata_reorder() calls on other CPUs. We just need any CPU out of
+	 * the cpumask.pcpu set. It would be nice if it's the right one but
+	 * it doesn't matter if we're off to the next one by using an outdated
+	 * pd->processed value.
+	 */
+	weight = cpumask_weight(pd->cpumask.pcpu);
+	target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
+
+	/* ensure to call the reorder callback on the correct CPU */
+	if (cpu != target_cpu) {
+		struct padata_parallel_queue *pqueue;
+		struct padata_instance *pinst;
+
+		/* The timer function is serialized wrt itself -- no locking
+		 * needed.
+		 */
+		pinst = pd->pinst;
+		pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
+		queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
+	} else {
+		padata_reorder(pd);
+	}
+
+	put_cpu();
 }
 
 static void padata_serial_worker(struct work_struct *serial_work)
@@ -399,6 +439,7 @@ static void padata_init_pqueues(struct parallel_data *pd)
 		__padata_list_init(&pqueue->reorder);
 		__padata_list_init(&pqueue->parallel);
 		INIT_WORK(&pqueue->work, padata_parallel_worker);
+		INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
 		atomic_set(&pqueue->num_obj, 0);
 	}
 }
-- 
cgit v1.3-14-g43fede


From 350ef88e7e922354f82a931897ad4a4ce6c686ff Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Fri, 8 Sep 2017 20:57:11 +0200
Subject: padata: ensure padata_do_serial() runs on the correct CPU

If the algorithm we're parallelizing is asynchronous we might change
CPUs between padata_do_parallel() and padata_do_serial(). However, we
don't expect this to happen as we need to enqueue the padata object into
the per-cpu reorder queue we took it from, i.e. the same-cpu's parallel
queue.

Ensure we're not switching CPUs for a given padata object by tracking
the CPU within the padata object. If the serial callback gets called on
the wrong CPU, defer invoking padata_reorder() via a kernel worker on
the CPU we're expected to run on.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/linux/padata.h |  2 ++
 kernel/padata.c        | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 5c0175bbc179..5d13d25da2c8 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -37,6 +37,7 @@
  * @list: List entry, to attach to the padata lists.
  * @pd: Pointer to the internal control structure.
  * @cb_cpu: Callback cpu for serializatioon.
+ * @cpu: Cpu for parallelization.
  * @seq_nr: Sequence number of the parallelized data object.
  * @info: Used to pass information from the parallel to the serial function.
  * @parallel: Parallel execution function.
@@ -46,6 +47,7 @@ struct padata_priv {
 	struct list_head	list;
 	struct parallel_data	*pd;
 	int			cb_cpu;
+	int			cpu;
 	int			info;
 	void                    (*parallel)(struct padata_priv *padata);
 	void                    (*serial)(struct padata_priv *padata);
diff --git a/kernel/padata.c b/kernel/padata.c
index b4066147bce4..f262c9a4e70a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst,
 	padata->cb_cpu = cb_cpu;
 
 	target_cpu = padata_cpu_hash(pd);
+	padata->cpu = target_cpu;
 	queue = per_cpu_ptr(pd->pqueue, target_cpu);
 
 	spin_lock(&queue->parallel.lock);
@@ -363,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata)
 	int cpu;
 	struct padata_parallel_queue *pqueue;
 	struct parallel_data *pd;
+	int reorder_via_wq = 0;
 
 	pd = padata->pd;
 
 	cpu = get_cpu();
+
+	/* We need to run on the same CPU padata_do_parallel(.., padata, ..)
+	 * was called on -- or, at least, enqueue the padata object into the
+	 * correct per-cpu queue.
+	 */
+	if (cpu != padata->cpu) {
+		reorder_via_wq = 1;
+		cpu = padata->cpu;
+	}
+
 	pqueue = per_cpu_ptr(pd->pqueue, cpu);
 
 	spin_lock(&pqueue->reorder.lock);
@@ -376,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata)
 
 	put_cpu();
 
-	padata_reorder(pd);
+	/* If we're running on the wrong CPU, call padata_reorder() via a
+	 * kernel worker.
+	 */
+	if (reorder_via_wq)
+		queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
+	else
+		padata_reorder(pd);
 }
 EXPORT_SYMBOL(padata_do_serial);
 
-- 
cgit v1.3-14-g43fede


From 97562633bcbac4a07d605ae628d7655fa71caaf5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:19 -0700
Subject: bpf: perf event change needed for subsequent bpf helpers

This patch does not impact existing functionalities.
It contains the changes in perf event area needed for
subsequent bpf_perf_event_read_value and
bpf_perf_prog_read_value helpers.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/perf_event.h |  7 +++++--
 kernel/bpf/arraymap.c      |  2 +-
 kernel/events/core.c       | 15 +++++++++++++--
 kernel/trace/bpf_trace.c   |  2 +-
 4 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e22f24ded6a..79b18a20cf5d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -806,6 +806,7 @@ struct perf_output_handle {
 struct bpf_perf_event_data_kern {
 	struct pt_regs *regs;
 	struct perf_sample_data *data;
+	struct perf_event *event;
 };
 
 #ifdef CONFIG_CGROUP_PERF
@@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
 				int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value);
+int perf_event_read_local(struct perf_event *event, u64 *value,
+			  u64 *enabled, u64 *running);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
@@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+static inline int perf_event_read_local(struct perf_event *event, u64 *value,
+					u64 *enabled, u64 *running)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 98c0f00c3f5e..68d866628be0 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 
 	ee = ERR_PTR(-EOPNOTSUPP);
 	event = perf_file->private_data;
-	if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
 		goto err_out;
 
 	ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..902149f05381 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event)
  *     will not be local and we cannot read them atomically
  *   - must not have a pmu::count method
  */
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+			  u64 *enabled, u64 *running)
 {
 	unsigned long flags;
 	int ret = 0;
+	u64 now;
 
 	/*
 	 * Disabling interrupts avoids all counter scheduling (context
@@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
 		goto out;
 	}
 
+	now = event->shadow_ctx_time + perf_clock();
+	if (enabled)
+		*enabled = now - event->tstamp_enabled;
 	/*
 	 * If the event is currently on this CPU, its either a per-task event,
 	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
 	 * oncpu == -1).
 	 */
-	if (event->oncpu == smp_processor_id())
+	if (event->oncpu == smp_processor_id()) {
 		event->pmu->read(event);
+		if (running)
+			*running = now - event->tstamp_running;
+	} else if (running) {
+		*running = event->total_time_running;
+	}
 
 	*value = local64_read(&event->count);
 out:
@@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event,
 	struct bpf_perf_event_data_kern ctx = {
 		.data = data,
 		.regs = regs,
+		.event = event,
 	};
 	int ret = 0;
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..95888ae6c263 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;
 
-	err = perf_event_read_local(ee->event, &value);
+	err = perf_event_read_local(ee->event, &value, NULL, NULL);
 	/*
 	 * this api is ugly since we miss [-22..-2] range of valid
 	 * counter values, but that's uapi
-- 
cgit v1.3-14-g43fede


From 908432ca84fc229e906ba164219e9ad0fe56f755 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:20 -0700
Subject: bpf: add helper bpf_perf_event_read_value for perf event array map

Hardware pmu counters are limited resources. When there are more
pmu based perf events opened than available counters, kernel will
multiplex these events so each event gets certain percentage
(but not 100%) of the pmu time. In case that multiplexing happens,
the number of samples or counter value will not reflect the
case compared to no multiplexing. This makes comparison between
different runs difficult.

Typically, the number of samples or counter value should be
normalized before comparing to other experiments. The typical
normalization is done like:
  normalized_num_samples = num_samples * time_enabled / time_running
  normalized_counter_value = counter_value * time_enabled / time_running
where time_enabled is the time enabled for event and time_running is
the time running for event since last normalization.

This patch adds helper bpf_perf_event_read_value for kprobed based perf
event array map, to read perf counter and enabled/running time.
The enabled/running time is accumulated since the perf event open.
To achieve scaling factor between two bpf invocations, users
can can use cpu_id as the key (which is typical for perf array usage model)
to remember the previous value and do the calculation inside the
bpf program.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 21 +++++++++++++++++++--
 kernel/bpf/verifier.c    |  4 +++-
 kernel/trace/bpf_trace.c | 45 +++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 63 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6082faf5fd2a..7b57a212c7d7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -641,6 +641,14 @@ union bpf_attr {
  *     @xdp_md: pointer to xdp_md
  *     @delta: An positive/negative integer to be added to xdp_md.data_meta
  *     Return: 0 on success or negative on error
+ *
+ * int bpf_perf_event_read_value(map, flags, buf, buf_size)
+ *     read perf event counter value and perf event enabled/running time
+ *     @map: pointer to perf_event_array map
+ *     @flags: index of event in the map or bitmask flags
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return: 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -697,7 +705,8 @@ union bpf_attr {
 	FN(redirect_map),		\
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
-	FN(xdp_adjust_meta),
+	FN(xdp_adjust_meta),		\
+	FN(perf_event_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -741,7 +750,9 @@ enum bpf_func_id {
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
 #define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
-/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */
+/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and
+ * BPF_FUNC_perf_event_read_value flags.
+ */
 #define BPF_F_INDEX_MASK		0xffffffffULL
 #define BPF_F_CURRENT_CPU		BPF_F_INDEX_MASK
 /* BPF_FUNC_perf_event_output for sk_buff input context. */
@@ -934,4 +945,10 @@ enum {
 #define TCP_BPF_IW		1001	/* Set TCP initial congestion window */
 #define TCP_BPF_SNDCWND_CLAMP	1002	/* Set sndcwnd_clamp */
 
+struct bpf_perf_event_value {
+	__u64 counter;
+	__u64 enabled;
+	__u64 running;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 52b022310f6a..590125e29161 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1552,7 +1552,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 		if (func_id != BPF_FUNC_perf_event_read &&
-		    func_id != BPF_FUNC_perf_event_output)
+		    func_id != BPF_FUNC_perf_event_output &&
+		    func_id != BPF_FUNC_perf_event_read_value)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -1595,6 +1596,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
+	case BPF_FUNC_perf_event_read_value:
 		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			goto error;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 95888ae6c263..0be86cc0130e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+		     u64 *value, u64 *enabled, u64 *running)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
-	u64 value = 0;
-	int err;
 
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
@@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;
 
-	err = perf_event_read_local(ee->event, &value, NULL, NULL);
+	return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+	u64 value = 0;
+	int err;
+
+	err = get_map_perf_counter(map, flags, &value, NULL, NULL);
 	/*
 	 * this api is ugly since we miss [-22..-2] range of valid
 	 * counter values, but that's uapi
@@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+				   &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+	.func		= bpf_perf_event_read_value,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
 static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
 
 static __always_inline u64
@@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_perf_event_read_value:
+		return &bpf_perf_event_read_value_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.3-14-g43fede


From 4bebdc7a85aa400c0222b5329861e4ad9252f1e5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:22 -0700
Subject: bpf: add helper bpf_perf_prog_read_value

This patch adds helper bpf_perf_prog_read_cvalue for perf event based bpf
programs, to read event counter and enabled/running time.
The enabled/running time is accumulated since the perf event open.

The typical use case for perf event based bpf program is to attach itself
to a single event. In such cases, if it is desirable to get scaling factor
between two bpf invocations, users can can save the time values in a map,
and use the value from the map and the current value to calculate
the scaling factor.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 10 +++++++++-
 kernel/trace/bpf_trace.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 7b57a212c7d7..5bbbec17aa5a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -649,6 +649,13 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return: 0 on success or negative error code
+ *
+ * int bpf_perf_prog_read_value(ctx, buf, buf_size)
+ *     read perf prog attached perf event counter and enabled/running time
+ *     @ctx: pointer to ctx
+ *     @buf: buf to fill
+ *     @buf_size: size of the buf
+ *     Return : 0 on success or negative error code
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -706,7 +713,8 @@ union bpf_attr {
 	FN(sk_redirect_map),		\
 	FN(sock_map_update),		\
 	FN(xdp_adjust_meta),		\
-	FN(perf_event_read_value),
+	FN(perf_event_read_value),	\
+	FN(perf_prog_read_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0be86cc0130e..04ea5314f2bc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -613,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+				    &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+         .func           = bpf_perf_prog_read_value_tp,
+         .gpl_only       = true,
+         .ret_type       = RET_INTEGER,
+         .arg1_type      = ARG_PTR_TO_CTX,
+         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+         .arg3_type      = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -620,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_perf_prog_read_value:
+		return &bpf_perf_prog_read_value_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}
-- 
cgit v1.3-14-g43fede


From 473d97343f94ff20f5196078314e4dd83156d3a2 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 5 Oct 2017 21:52:11 -0700
Subject: bpf: Change bpf_obj_name_cpy() to better ensure map's name is init by
 0

During get_info_by_fd, the prog/map name is memcpy-ed.  It depends
on the prog->aux->name and map->name to be zero initialized.

bpf_prog_aux is easy to guarantee that aux->name is zero init.

The name in bpf_map may be harder to be guaranteed in the future when
new map type is added.

Hence, this patch makes bpf_obj_name_cpy() to always zero init
the prog/map name.

Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0048cb24ba7b..d124e702e040 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -322,6 +322,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 {
 	const char *end = src + BPF_OBJ_NAME_LEN;
 
+	memset(dst, 0, BPF_OBJ_NAME_LEN);
+
 	/* Copy all isalnum() and '_' char */
 	while (src < end && *src) {
 		if (!isalnum(*src) && *src != '_')
@@ -333,9 +335,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
 	if (src == end)
 		return -EINVAL;
 
-	/* '\0' terminates dst */
-	*dst = 0;
-
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 368211fb920a0b789c238942c6af0414539f79d6 Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Thu, 5 Oct 2017 21:52:13 -0700
Subject: bpf: Append prog->aux->name in bpf_get_prog_name()

This patch makes the bpf_prog's name available
in kallsyms.

The new format is bpf_prog_tag[_name].

Sample kallsyms from running selftests/bpf/test_progs:
[root@arch-fb-vm1 ~]# egrep ' bpf_prog_[0-9a-fA-F]{16}' /proc/kallsyms
ffffffffa0048000 t bpf_prog_dabf0207d1992486_test_obj_id
ffffffffa0038000 t bpf_prog_a04f5eef06a7f555__123456789ABCDE
ffffffffa0050000 t bpf_prog_a04f5eef06a7f555

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c6be15ae83ee..248961af2421 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
 
 static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
+	const char *end = sym + KSYM_NAME_LEN;
+
 	BUILD_BUG_ON(sizeof("bpf_prog_") +
-		     sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+		     sizeof(prog->tag) * 2 +
+		     /* name has been null terminated.
+		      * We should need +1 for the '_' preceding
+		      * the name.  However, the null character
+		      * is double counted between the name and the
+		      * sizeof("bpf_prog_") above, so we omit
+		      * the +1 here.
+		      */
+		     sizeof(prog->aux->name) > KSYM_NAME_LEN);
 
 	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
-	*sym = 0;
+	if (prog->aux->name[0])
+		snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+	else
+		*sym = 0;
 }
 
 static __always_inline unsigned long
-- 
cgit v1.3-14-g43fede


From 135bd1a230bb69a68c9808a7d25467318900b80a Mon Sep 17 00:00:00 2001
From: Neeraj Upadhyay <neeraju@codeaurora.org>
Date: Mon, 7 Aug 2017 11:20:10 +0530
Subject: rcu: Fix up pending cbs check in rcu_prepare_for_idle

The pending-callbacks check in rcu_prepare_for_idle() is backwards.
It should accelerate if there are pending callbacks, but the check
rather uselessly accelerates only if there are no callbacks.  This commit
therefore inverts this check.

Fixes: 15fecf89e46a ("srcu: Abstract multi-tail callback list handling")
Signed-off-by: Neeraj Upadhyay <neeraju@codeaurora.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org> # 4.12.x
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..fed95fa941e6 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1507,7 +1507,7 @@ static void rcu_prepare_for_idle(void)
 	rdtp->last_accelerate = jiffies;
 	for_each_rcu_flavor(rsp) {
 		rdp = this_cpu_ptr(rsp->rda);
-		if (rcu_segcblist_pend_cbs(&rdp->cblist))
+		if (!rcu_segcblist_pend_cbs(&rdp->cblist))
 			continue;
 		rnp = rdp->mynode;
 		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
-- 
cgit v1.3-14-g43fede


From c63eb17ff06dbcf73e771b9b425c531cc0a9c17b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 11 Aug 2017 12:37:07 -0700
Subject: rcu: Create call_rcu_tasks() kthread at boot time

Currently the call_rcu_tasks() kthread is created upon first
invocation of call_rcu_tasks().  This has the advantage of avoiding
creation if there are never any invocations of call_rcu_tasks() and of
synchronize_rcu_tasks(), but it requires an unreliable heuristic to
determine when it is safe to create the kthread.  For example, it is
not safe to create the kthread when call_rcu_tasks() is invoked with
a spinlock held, but there is no good way to detect this in !PREEMPT
kernels.

This commit therefore creates this kthread unconditionally at
core_initcall() time.  If you don't want this kthread created, then
build with CONFIG_TASKS_RCU=n.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/update.c | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5033b66d2753..e9bbedbb8745 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -575,7 +575,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
 static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
 module_param(rcu_task_stall_timeout, int, 0644);
 
-static void rcu_spawn_tasks_kthread(void);
 static struct task_struct *rcu_tasks_kthread_ptr;
 
 /**
@@ -600,7 +599,6 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
 {
 	unsigned long flags;
 	bool needwake;
-	bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
 
 	rhp->next = NULL;
 	rhp->func = func;
@@ -610,11 +608,8 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
 	rcu_tasks_cbs_tail = &rhp->next;
 	raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
 	/* We can't create the thread unless interrupts are enabled. */
-	if ((needwake && havetask) ||
-	    (!havetask && !irqs_disabled_flags(flags))) {
-		rcu_spawn_tasks_kthread();
+	if (needwake && READ_ONCE(rcu_tasks_kthread_ptr))
 		wake_up(&rcu_tasks_cbs_wq);
-	}
 }
 EXPORT_SYMBOL_GPL(call_rcu_tasks);
 
@@ -853,27 +848,18 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 	}
 }
 
-/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
-static void rcu_spawn_tasks_kthread(void)
+/* Spawn rcu_tasks_kthread() at core_initcall() time. */
+static int __init rcu_spawn_tasks_kthread(void)
 {
-	static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
 	struct task_struct *t;
 
-	if (READ_ONCE(rcu_tasks_kthread_ptr)) {
-		smp_mb(); /* Ensure caller sees full kthread. */
-		return;
-	}
-	mutex_lock(&rcu_tasks_kthread_mutex);
-	if (rcu_tasks_kthread_ptr) {
-		mutex_unlock(&rcu_tasks_kthread_mutex);
-		return;
-	}
 	t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
 	BUG_ON(IS_ERR(t));
 	smp_mb(); /* Ensure others see full kthread. */
 	WRITE_ONCE(rcu_tasks_kthread_ptr, t);
-	mutex_unlock(&rcu_tasks_kthread_mutex);
+	return 0;
 }
+core_initcall(rcu_spawn_tasks_kthread);
 
 /* Do the srcu_read_lock() for the above synchronize_srcu().  */
 void exit_tasks_rcu_start(void)
-- 
cgit v1.3-14-g43fede


From 6733bab7bc09b67668028dab562caea1b4ff3c69 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 18 Aug 2017 10:59:16 -0700
Subject: irq_work: Map irq_work_on_queue() to irq_work_on() in !SMP

Commit 478850160636 ("irq_work: Implement remote queueing") provides
irq_work_on_queue() only for SMP builds.  However, providing it simplifies
code that submits irq_work to lists of CPUs, eliminating the !SMP special
cases.  This commit therefore maps irq_work_on_queue() to irq_work_on()
in !SMP builds, but validating the specified CPU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/irq_work.h | 3 ---
 kernel/irq_work.c        | 9 ++++++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h
index 47b9ebd4a74f..d8c9876d5da4 100644
--- a/include/linux/irq_work.h
+++ b/include/linux/irq_work.h
@@ -33,10 +33,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *))
 #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), }
 
 bool irq_work_queue(struct irq_work *work);
-
-#ifdef CONFIG_SMP
 bool irq_work_queue_on(struct irq_work *work, int cpu);
-#endif
 
 void irq_work_tick(void);
 void irq_work_sync(struct irq_work *work);
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index bcf107ce0854..9f20f6c72579 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void)
 	 */
 }
 
-#ifdef CONFIG_SMP
 /*
  * Enqueue the irq_work @work on @cpu unless it's already pending
  * somewhere.
@@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
 	/* All work should have been flushed before going offline */
 	WARN_ON_ONCE(cpu_is_offline(cpu));
 
+#ifdef CONFIG_SMP
+
 	/* Arch remote IPI send/receive backend aren't NMI safe */
 	WARN_ON_ONCE(in_nmi());
 
@@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
 	if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
 		arch_send_call_function_single_ipi(cpu);
 
+#else /* #ifdef CONFIG_SMP */
+	irq_work_queue(work);
+#endif /* #else #ifdef CONFIG_SMP */
+
 	return true;
 }
-EXPORT_SYMBOL_GPL(irq_work_queue_on);
-#endif
 
 /* Enqueue the irq work @work on the current CPU */
 bool irq_work_queue(struct irq_work *work)
-- 
cgit v1.3-14-g43fede


From 7c2102e56a3f7d85b5d8f33efbd7aecc1f36fdd8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 18 Sep 2017 08:54:40 -0700
Subject: sched: Make resched_cpu() unconditional

The current implementation of synchronize_sched_expedited() incorrectly
assumes that resched_cpu() is unconditional, which it is not.  This means
that synchronize_sched_expedited() can hang when resched_cpu()'s trylock
fails as follows (analysis by Neeraj Upadhyay):

o	CPU1 is waiting for expedited wait to complete:

	sync_rcu_exp_select_cpus
	     rdp->exp_dynticks_snap & 0x1   // returns 1 for CPU5
	     IPI sent to CPU5

	synchronize_sched_expedited_wait
		 ret = swait_event_timeout(rsp->expedited_wq,
					   sync_rcu_preempt_exp_done(rnp_root),
					   jiffies_stall);

	expmask = 0x20, CPU 5 in idle path (in cpuidle_enter())

o	CPU5 handles IPI and fails to acquire rq lock.

	Handles IPI
	     sync_sched_exp_handler
		 resched_cpu
		     returns while failing to try lock acquire rq->lock
		 need_resched is not set

o	CPU5 calls  rcu_idle_enter() and as need_resched is not set, goes to
	idle (schedule() is not called).

o	CPU 1 reports RCU stall.

Given that resched_cpu() is now used only by RCU, this commit fixes the
assumption by making resched_cpu() unconditional.

Reported-by: Neeraj Upadhyay <neeraju@codeaurora.org>
Suggested-by: Neeraj Upadhyay <neeraju@codeaurora.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: stable@vger.kernel.org
---
 kernel/sched/core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..8fa7b6f9e19b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -505,8 +505,7 @@ void resched_cpu(int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
-		return;
+	raw_spin_lock_irqsave(&rq->lock, flags);
 	resched_curr(rq);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
-- 
cgit v1.3-14-g43fede


From f79c3ad6189624c3de0ad5521610c9e22a1c33cf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 30 Nov 2016 06:24:30 -0800
Subject: sched,rcu: Make cond_resched() provide RCU quiescent state

There is some confusion as to which of cond_resched() or
cond_resched_rcu_qs() should be added to long in-kernel loops.
This commit therefore eliminates the decision by adding RCU quiescent
states to cond_resched().  This commit also simplifies the code that
used to interact with cond_resched_rcu_qs(), and that now interacts with
cond_resched(), to reduce its overhead.  This reduction is necessary to
allow the heavier-weight cond_resched_rcu_qs() mechanism to be invoked
everywhere that cond_resched() is invoked.

Part of that reduction in overhead converts the jiffies_till_sched_qs
kernel parameter to read-only at runtime, thus eliminating the need for
bounds checking.

Reported-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
[ paulmck: Keep PREEMPT=n cond_resched a no-op, per Peter Zijlstra. ]
---
 kernel/rcu/tree.c   | 25 +++++--------------------
 kernel/sched/core.c |  1 +
 2 files changed, 6 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b0ad62b0e7b8..0dda57a28276 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -534,8 +534,8 @@ module_param(rcu_kick_kthreads, bool, 0644);
  * How long the grace period must be before we start recruiting
  * quiescent-state help from rcu_note_context_switch().
  */
-static ulong jiffies_till_sched_qs = HZ / 20;
-module_param(jiffies_till_sched_qs, ulong, 0644);
+static ulong jiffies_till_sched_qs = HZ / 10;
+module_param(jiffies_till_sched_qs, ulong, 0444);
 
 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 				  struct rcu_data *rdp);
@@ -1235,7 +1235,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	unsigned long jtsq;
 	bool *rnhqp;
 	bool *ruqp;
-	unsigned long rjtsc;
 	struct rcu_node *rnp;
 
 	/*
@@ -1252,23 +1251,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 		return 1;
 	}
 
-	/* Compute and saturate jiffies_till_sched_qs. */
-	jtsq = jiffies_till_sched_qs;
-	rjtsc = rcu_jiffies_till_stall_check();
-	if (jtsq > rjtsc / 2) {
-		WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
-		jtsq = rjtsc / 2;
-	} else if (jtsq < 1) {
-		WRITE_ONCE(jiffies_till_sched_qs, 1);
-		jtsq = 1;
-	}
-
 	/*
 	 * Has this CPU encountered a cond_resched_rcu_qs() since the
 	 * beginning of the grace period?  For this to be the case,
 	 * the CPU has to have noticed the current grace period.  This
 	 * might not be the case for nohz_full CPUs looping in the kernel.
 	 */
+	jtsq = jiffies_till_sched_qs;
 	rnp = rdp->mynode;
 	ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
 	if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
@@ -1276,7 +1265,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	    READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
 		return 1;
-	} else {
+	} else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) {
 		/* Load rcu_qs_ctr before store to rcu_urgent_qs. */
 		smp_store_release(ruqp, true);
 	}
@@ -1304,10 +1293,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	 * updates are only once every few jiffies, the probability of
 	 * lossage (and thus of slight grace-period extension) is
 	 * quite low.
-	 *
-	 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
-	 * is set too high, we override with half of the RCU CPU stall
-	 * warning delay.
 	 */
 	rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
 	if (!READ_ONCE(*rnhqp) &&
@@ -1316,7 +1301,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 		WRITE_ONCE(*rnhqp, true);
 		/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
 		smp_store_release(ruqp, true);
-		rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+		rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */
 	}
 
 	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..904d3ab35c83 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4842,6 +4842,7 @@ int __sched _cond_resched(void)
 		preempt_schedule_common();
 		return 1;
 	}
+	rcu_all_qs();
 	return 0;
 }
 EXPORT_SYMBOL(_cond_resched);
-- 
cgit v1.3-14-g43fede


From 9b9500da81502738efa1b485a8835f174ff7be6d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 17 Aug 2017 17:05:59 -0700
Subject: rcu: Make RCU CPU stall warnings check for irq-disabled CPUs

One common question upon seeing an RCU CPU stall warning is "did
the stalled CPUs have interrupts disabled?"  However, the current
stall warnings are silent on this point.  This commit therefore
uses irq_work to check whether stalled CPUs still respond to IPIs,
and flags this state in the RCU CPU stall warning console messages.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 104 +++++++++++++++++++++++++++++++++++++++++------
 kernel/rcu/tree.h        |   5 +++
 kernel/rcu/tree_plugin.h |   7 +++-
 3 files changed, 103 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0dda57a28276..12838a9a128e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1206,6 +1206,22 @@ static int rcu_is_cpu_rrupt_from_idle(void)
 	return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
 }
 
+/*
+ * We are reporting a quiescent state on behalf of some other CPU, so
+ * it is our responsibility to check for and handle potential overflow
+ * of the rcu_node ->gpnum counter with respect to the rcu_data counters.
+ * After all, the CPU might be in deep idle state, and thus executing no
+ * code whatsoever.
+ */
+static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+	lockdep_assert_held(&rnp->lock);
+	if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
+		WRITE_ONCE(rdp->gpwrap, true);
+	if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
+		rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4;
+}
+
 /*
  * Snapshot the specified CPU's dynticks counter so that we can later
  * credit them with an implicit quiescent state.  Return 1 if this CPU
@@ -1216,14 +1232,33 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
 	rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
 	if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
-		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
-				 rdp->mynode->gpnum))
-			WRITE_ONCE(rdp->gpwrap, true);
+		rcu_gpnum_ovf(rdp->mynode, rdp);
 		return 1;
 	}
 	return 0;
 }
 
+/*
+ * Handler for the irq_work request posted when a grace period has
+ * gone on for too long, but not yet long enough for an RCU CPU
+ * stall warning.  Set state appropriately, but just complain if
+ * there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+
+	rdp = container_of(iwp, struct rcu_data, rcu_iw);
+	rnp = rdp->mynode;
+	raw_spin_lock_rcu_node(rnp);
+	if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+		rdp->rcu_iw_gpnum = rnp->gpnum;
+		rdp->rcu_iw_pending = false;
+	}
+	raw_spin_unlock_rcu_node(rnp);
+}
+
 /*
  * Return true if the specified CPU has passed through a quiescent
  * state by virtue of being in or having passed through an dynticks
@@ -1235,7 +1270,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	unsigned long jtsq;
 	bool *rnhqp;
 	bool *ruqp;
-	struct rcu_node *rnp;
+	struct rcu_node *rnp = rdp->mynode;
 
 	/*
 	 * If the CPU passed through or entered a dynticks idle phase with
@@ -1248,6 +1283,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
 		rdp->dynticks_fqs++;
+		rcu_gpnum_ovf(rnp, rdp);
 		return 1;
 	}
 
@@ -1258,12 +1294,12 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	 * might not be the case for nohz_full CPUs looping in the kernel.
 	 */
 	jtsq = jiffies_till_sched_qs;
-	rnp = rdp->mynode;
 	ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
 	if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
 	    READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
 	    READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
+		rcu_gpnum_ovf(rnp, rdp);
 		return 1;
 	} else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) {
 		/* Load rcu_qs_ctr before store to rcu_urgent_qs. */
@@ -1274,6 +1310,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) {
 		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
 		rdp->offline_fqs++;
+		rcu_gpnum_ovf(rnp, rdp);
 		return 1;
 	}
 
@@ -1305,11 +1342,22 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 	}
 
 	/*
-	 * If more than halfway to RCU CPU stall-warning time, do
-	 * a resched_cpu() to try to loosen things up a bit.
+	 * If more than halfway to RCU CPU stall-warning time, do a
+	 * resched_cpu() to try to loosen things up a bit.  Also check to
+	 * see if the CPU is getting hammered with interrupts, but only
+	 * once per grace period, just to keep the IPIs down to a dull roar.
 	 */
-	if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2)
+	if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) {
 		resched_cpu(rdp->cpu);
+		if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+		    !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum &&
+		    (rnp->ffmask & rdp->grpmask)) {
+			init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
+			rdp->rcu_iw_pending = true;
+			rdp->rcu_iw_gpnum = rnp->gpnum;
+			irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
+		}
+	}
 
 	return 0;
 }
@@ -1498,6 +1546,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
 {
 	int cpu;
 	unsigned long flags;
+	struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
 	struct rcu_node *rnp = rcu_get_root(rsp);
 	long totqlen = 0;
 
@@ -1513,7 +1562,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
 	 */
 	pr_err("INFO: %s self-detected stall on CPU", rsp->name);
 	print_cpu_stall_info_begin();
+	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
 	print_cpu_stall_info(rsp, smp_processor_id());
+	raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
 	print_cpu_stall_info_end();
 	for_each_possible_cpu(cpu)
 		totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
@@ -1907,6 +1958,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
 		rdp->core_needs_qs = need_gp;
 		zero_cpu_stall_ticks(rdp);
 		WRITE_ONCE(rdp->gpwrap, false);
+		rcu_gpnum_ovf(rnp, rdp);
 	}
 	return ret;
 }
@@ -3685,6 +3737,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->cpu_no_qs.b.norm = true;
 	rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
 	rdp->core_needs_qs = false;
+	rdp->rcu_iw_pending = false;
+	rdp->rcu_iw_gpnum = rnp->gpnum - 1;
 	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 }
@@ -3722,10 +3776,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
  */
 int rcutree_online_cpu(unsigned int cpu)
 {
-	sync_sched_exp_online_cleanup(cpu);
-	rcutree_affinity_setting(cpu, -1);
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp) {
+		rdp = per_cpu_ptr(rsp->rda, cpu);
+		rnp = rdp->mynode;
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		rnp->ffmask |= rdp->grpmask;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
 	if (IS_ENABLED(CONFIG_TREE_SRCU))
 		srcu_online_cpu(cpu);
+	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+		return 0; /* Too early in boot for scheduler work. */
+	sync_sched_exp_online_cleanup(cpu);
+	rcutree_affinity_setting(cpu, -1);
 	return 0;
 }
 
@@ -3735,6 +3803,19 @@ int rcutree_online_cpu(unsigned int cpu)
  */
 int rcutree_offline_cpu(unsigned int cpu)
 {
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_node *rnp;
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp) {
+		rdp = per_cpu_ptr(rsp->rda, cpu);
+		rnp = rdp->mynode;
+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
+		rnp->ffmask &= ~rdp->grpmask;
+		raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+	}
+
 	rcutree_affinity_setting(cpu, cpu);
 	if (IS_ENABLED(CONFIG_TREE_SRCU))
 		srcu_offline_cpu(cpu);
@@ -4183,8 +4264,7 @@ void __init rcu_init(void)
 	for_each_online_cpu(cpu) {
 		rcutree_prepare_cpu(cpu);
 		rcu_cpu_starting(cpu);
-		if (IS_ENABLED(CONFIG_TREE_SRCU))
-			srcu_online_cpu(cpu);
+		rcutree_online_cpu(cpu);
 	}
 }
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e1f285f0a70..46a5d1991450 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -103,6 +103,7 @@ struct rcu_node {
 				/* Online CPUs for next expedited GP. */
 				/*  Any CPU that has ever been online will */
 				/*  have its bit set. */
+	unsigned long ffmask;	/* Fully functional CPUs. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
 				/*  Only one bit will be set in this mask. */
 	int	grplo;		/* lowest-numbered CPU or group here. */
@@ -285,6 +286,10 @@ struct rcu_data {
 
 	/* 8) RCU CPU stall data. */
 	unsigned int softirq_snap;	/* Snapshot of softirq activity. */
+	/* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
+	struct irq_work rcu_iw;		/* Check for non-irq activity. */
+	bool rcu_iw_pending;		/* Is ->rcu_iw pending? */
+	unsigned long rcu_iw_gpnum;	/* ->gpnum associated with ->rcu_iw. */
 
 	int cpu;
 	struct rcu_state *rsp;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..14977d0470d1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1671,6 +1671,7 @@ static void print_cpu_stall_info_begin(void)
  */
 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 {
+	unsigned long delta;
 	char fast_no_hz[72];
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_dynticks *rdtp = rdp->dynticks;
@@ -1685,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
 		ticks_value = rsp->gpnum - rdp->gpnum;
 	}
 	print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
-	pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+	delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum;
+	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
 	       cpu,
 	       "O."[!!cpu_online(cpu)],
 	       "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
 	       "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+	       !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+			rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+				"!."[!delta],
 	       ticks_value, ticks_title,
 	       rcu_dynticks_snap(rdtp) & 0xfff,
 	       rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
-- 
cgit v1.3-14-g43fede


From 83b6ca1fede773eebcdfb44f5a94eb410d48b886 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 31 Aug 2017 16:47:08 -0700
Subject: rcu: Turn off tracing before dumping trace

Currently, RCU allows tracing to continue when it automatically does
ftrace_dump() after detecting an error condition, which can result in
excessively large traces and lost trace events.  This commit therefore
does a tracing_off() before any of these ftrace_dump() calls.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index e4b43fef89f5..b8729eb09a5d 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -220,8 +220,10 @@ do { \
 	static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
 	\
 	if (!atomic_read(&___rfd_beenhere) && \
-	    !atomic_xchg(&___rfd_beenhere, 1)) \
+	    !atomic_xchg(&___rfd_beenhere, 1)) { \
+		tracing_off(); \
 		ftrace_dump(oops_dump_mode); \
+	} \
 } while (0)
 
 void rcu_early_boot_tests(void);
-- 
cgit v1.3-14-g43fede


From f22ce09157239aab08eae99c678ef664f71a9097 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 1 Sep 2017 14:40:54 -0700
Subject: rcu: Suppress RCU CPU stall warnings while dumping trace

Currently, RCU emits Suppress RCU CPU stall warnings during its
automatically initiated ftrace_dump() calls after detecting an error
condition, which can result in excessively excessive console output
and lost trace events.  This commit therefore suppresses RCU CPU stall
warnings across any of these ftrace_dump() calls.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu.h    | 17 +++++++++++++++++
 kernel/rcu/update.c |  1 +
 2 files changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index b8729eb09a5d..59c471de342a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -203,6 +203,21 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
 extern int rcu_cpu_stall_suppress;
 int rcu_jiffies_till_stall_check(void);
 
+#define rcu_ftrace_dump_stall_suppress() \
+do { \
+	if (!rcu_cpu_stall_suppress) \
+		rcu_cpu_stall_suppress = 3; \
+} while (0)
+
+#define rcu_ftrace_dump_stall_unsuppress() \
+do { \
+	if (rcu_cpu_stall_suppress == 3) \
+		rcu_cpu_stall_suppress = 0; \
+} while (0)
+
+#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */
+#define rcu_ftrace_dump_stall_suppress()
+#define rcu_ftrace_dump_stall_unsuppress()
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
 
 /*
@@ -222,7 +237,9 @@ do { \
 	if (!atomic_read(&___rfd_beenhere) && \
 	    !atomic_xchg(&___rfd_beenhere, 1)) { \
 		tracing_off(); \
+		rcu_ftrace_dump_stall_suppress(); \
 		ftrace_dump(oops_dump_mode); \
+		rcu_ftrace_dump_stall_unsuppress(); \
 	} \
 } while (0)
 
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5033b66d2753..3dc8efb16dc7 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -494,6 +494,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
 #endif
 
 int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
 static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
 
 module_param(rcu_cpu_stall_suppress, int, 0644);
-- 
cgit v1.3-14-g43fede


From 2b1516e55f8416acfb48d5f43d41222d180fb5a3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 18 Aug 2017 16:11:37 -0700
Subject: rcutorture: Add interrupt-disable capability to stall-warning tests

When rcutorture sees the rcutorture.stall_cpu kernel boot parameter,
it loops with preemption disabled, which does in fact normally
generate an RCU CPU stall warning message.  However, there are test
scenarios that need the stalling CPU to have interrupts disabled.
This commit therefore adds an rcutorture.stall_cpu_irqsoff kernel
boot parameter that causes the stalling CPU to disable interrupts.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/admin-guide/kernel-parameters.txt |  3 +++
 kernel/rcu/rcutorture.c                         | 18 +++++++++++++-----
 2 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 05496622b4ef..bc94c47085a5 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3539,6 +3539,9 @@
 	rcutorture.stall_cpu_holdoff= [KNL]
 			Time to wait (s) after boot before inducing stall.
 
+	rcutorture.stall_cpu_irqsoff= [KNL]
+			Disable interrupts while stalling if set.
+
 	rcutorture.stat_interval= [KNL]
 			Time (s) between statistics printk()s.
 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 45f2ffbc1e78..0273bc0a8586 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -89,6 +89,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
 torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
 torture_param(int, stall_cpu_holdoff, 10,
 	     "Time to wait before starting stall (s).");
+torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
 torture_param(int, stat_interval, 60,
 	     "Number of seconds between stats printk()s");
 torture_param(int, stutter, 5, "Number of seconds to run/halt test");
@@ -1357,7 +1358,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 		 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
 		 "test_boost=%d/%d test_boost_interval=%d "
 		 "test_boost_duration=%d shutdown_secs=%d "
-		 "stall_cpu=%d stall_cpu_holdoff=%d "
+		 "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
 		 "n_barrier_cbs=%d "
 		 "onoff_interval=%d onoff_holdoff=%d\n",
 		 torture_type, tag, nrealreaders, nfakewriters,
@@ -1365,7 +1366,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 		 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
 		 test_boost, cur_ops->can_boost,
 		 test_boost_interval, test_boost_duration, shutdown_secs,
-		 stall_cpu, stall_cpu_holdoff,
+		 stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
 		 n_barrier_cbs,
 		 onoff_interval, onoff_holdoff);
 }
@@ -1430,12 +1431,19 @@ static int rcu_torture_stall(void *args)
 	if (!kthread_should_stop()) {
 		stop_at = get_seconds() + stall_cpu;
 		/* RCU CPU stall is expected behavior in following code. */
-		pr_alert("rcu_torture_stall start.\n");
 		rcu_read_lock();
-		preempt_disable();
+		if (stall_cpu_irqsoff)
+			local_irq_disable();
+		else
+			preempt_disable();
+		pr_alert("rcu_torture_stall start on CPU %d.\n",
+			 smp_processor_id());
 		while (ULONG_CMP_LT(get_seconds(), stop_at))
 			continue;  /* Induce RCU CPU stall warning. */
-		preempt_enable();
+		if (stall_cpu_irqsoff)
+			local_irq_enable();
+		else
+			preempt_enable();
 		rcu_read_unlock();
 		pr_alert("rcu_torture_stall end.\n");
 	}
-- 
cgit v1.3-14-g43fede


From 0032f4e889764d22ccccb6a15742071d6f0d1f5a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 30 Aug 2017 10:40:17 -0700
Subject: rcutorture: Dump writer stack if stalled

Right now, rcutorture warns if an rcu_torture_writer() kthread stalls,
but this warning is not always all that helpful.  This commit therefore
makes the first such warning include a stack dump.

This in turn requires that sched_show_task() be exported to GPL modules,
so this commit makes that change as well.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 6 ++++++
 kernel/sched/core.c     | 1 +
 2 files changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 0273bc0a8586..362eb2f78b3c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,6 +51,7 @@
 #include <asm/byteorder.h>
 #include <linux/torture.h>
 #include <linux/vmalloc.h>
+#include <linux/sched/debug.h>
 
 #include "rcu.h"
 
@@ -1240,6 +1241,7 @@ rcu_torture_stats_print(void)
 	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
 	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
 	static unsigned long rtcv_snap = ULONG_MAX;
+	static bool splatted;
 	struct task_struct *wtp;
 
 	for_each_possible_cpu(cpu) {
@@ -1325,6 +1327,10 @@ rcu_torture_stats_print(void)
 			 gpnum, completed, flags,
 			 wtp == NULL ? ~0UL : wtp->state,
 			 wtp == NULL ? -1 : (int)task_cpu(wtp));
+		if (!splatted && wtp) {
+			sched_show_task(wtp);
+			splatted = true;
+		}
 		show_rcu_gp_kthreads();
 		rcu_ftrace_dump(DUMP_ALL);
 	}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..7ae0151dcc1d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5165,6 +5165,7 @@ void sched_show_task(struct task_struct *p)
 	show_stack(p, NULL);
 	put_task_stack(p);
 }
+EXPORT_SYMBOL_GPL(sched_show_task);
 
 static inline bool
 state_filter_match(unsigned long state_filter, struct task_struct *p)
-- 
cgit v1.3-14-g43fede


From 62cb1188ed86a9cf082fd2f757d4dd9b54741f24 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 29 Aug 2017 15:07:54 +0200
Subject: sched/idle: Move quiet_vmstate() into the NOHZ code

quiet_vmstat() is an expensive function that only makes sense when we
go into NOHZ.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: aubrey.li@linux.intel.com
Cc: cl@linux.com
Cc: fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/idle.c      | 1 -
 kernel/time/tick-sched.c | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 257f4f0b4532..b2e8f0afbb42 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,7 +219,6 @@ static void do_idle(void)
 	 */
 
 	__current_set_polling();
-	quiet_vmstat();
 	tick_nohz_idle_enter();
 
 	while (!need_resched()) {
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c7a899c5ce64..7b258c59d78a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -27,6 +27,7 @@
 #include <linux/irq_work.h>
 #include <linux/posix-timers.h>
 #include <linux/context_tracking.h>
+#include <linux/mm.h>
 
 #include <asm/irq_regs.h>
 
@@ -787,6 +788,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	if (!ts->tick_stopped) {
 		calc_load_nohz_start();
 		cpu_load_update_nohz_start();
+		quiet_vmstat();
 
 		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
 		ts->tick_stopped = 1;
-- 
cgit v1.3-14-g43fede


From 1d48b080bcce0a5e7d7aa2dbcdb35deefc188c3f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 Sep 2017 13:50:16 +0200
Subject: sched/debug: Rename task-state printing helpers

Steve requested better names for the new task-state helper functions.

So introduce the concept of task-state index for the printing and
rename __get_task_state() to task_state_index() and
__task_state_to_char() to task_index_to_char().

Requested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170929115016.pzlqc7ss3ccystyg@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 fs/proc/array.c                   |  2 +-
 include/linux/sched.h             |  6 +++---
 include/trace/events/sched.h      |  2 +-
 kernel/trace/trace_output.c       | 12 ++++++------
 kernel/trace/trace_sched_wakeup.c |  8 ++++----
 5 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 77a8eacbe032..1b5406ca8e4c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -137,7 +137,7 @@ static const char * const task_state_array[] = {
 static inline const char *get_task_state(struct task_struct *tsk)
 {
 	BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
-	return task_state_array[__get_task_state(tsk)];
+	return task_state_array[task_state_index(tsk)];
 }
 
 static inline int get_task_umask(struct task_struct *tsk)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index bdd6ad6fcce1..33a01f4deb00 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1248,7 +1248,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 #define TASK_REPORT_IDLE	(TASK_REPORT + 1)
 #define TASK_REPORT_MAX		(TASK_REPORT_IDLE << 1)
 
-static inline unsigned int __get_task_state(struct task_struct *tsk)
+static inline unsigned int task_state_index(struct task_struct *tsk)
 {
 	unsigned int tsk_state = READ_ONCE(tsk->state);
 	unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
@@ -1261,7 +1261,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk)
 	return fls(state);
 }
 
-static inline char __task_state_to_char(unsigned int state)
+static inline char task_index_to_char(unsigned int state)
 {
 	static const char state_char[] = "RSDTtXZPI";
 
@@ -1272,7 +1272,7 @@ static inline char __task_state_to_char(unsigned int state)
 
 static inline char task_state_to_char(struct task_struct *tsk)
 {
-	return __task_state_to_char(__get_task_state(tsk));
+	return task_index_to_char(task_state_index(tsk));
 }
 
 /**
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 3c8b7f625670..fab74a12ca0f 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -117,7 +117,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
 	if (preempt)
 		return TASK_STATE_MAX;
 
-	return __get_task_state(p);
+	return task_state_index(p);
 }
 #endif /* CREATE_TRACE_POINTS */
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c738e764e2a5..90db994ac900 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	T = __task_state_to_char(field->next_state);
-	S = __task_state_to_char(field->prev_state);
+	T = task_index_to_char(field->next_state);
+	S = task_index_to_char(field->prev_state);
 	trace_find_cmdline(field->next_pid, comm);
 	trace_seq_printf(&iter->seq,
 			 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 	trace_assign_type(field, iter->ent);
 
 	if (!S)
-		S = __task_state_to_char(field->prev_state);
-	T = __task_state_to_char(field->next_state);
+		S = task_index_to_char(field->prev_state);
+	T = task_index_to_char(field->next_state);
 	trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
 			 field->prev_pid,
 			 field->prev_prio,
@@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 	trace_assign_type(field, iter->ent);
 
 	if (!S)
-		S = __task_state_to_char(field->prev_state);
-	T = __task_state_to_char(field->next_state);
+		S = task_index_to_char(field->prev_state);
+	T = task_index_to_char(field->next_state);
 
 	SEQ_PUT_HEX_FIELD(s, field->prev_pid);
 	SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0c331978b1a6..500f370d3bb1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->prev_pid			= prev->pid;
 	entry->prev_prio		= prev->prio;
-	entry->prev_state		= __get_task_state(prev);
+	entry->prev_state		= task_state_index(prev);
 	entry->next_pid			= next->pid;
 	entry->next_prio		= next->prio;
-	entry->next_state		= __get_task_state(next);
+	entry->next_state		= task_state_index(next);
 	entry->next_cpu	= task_cpu(next);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->prev_pid			= curr->pid;
 	entry->prev_prio		= curr->prio;
-	entry->prev_state		= __get_task_state(curr);
+	entry->prev_state		= task_state_index(curr);
 	entry->next_pid			= wakee->pid;
 	entry->next_prio		= wakee->prio;
-	entry->next_state		= __get_task_state(wakee);
+	entry->next_state		= task_state_index(wakee);
 	entry->next_cpu			= task_cpu(wakee);
 
 	if (!call_filter_check_discard(call, entry, buffer, event))
-- 
cgit v1.3-14-g43fede


From e964d3501b64d6930aaa4dd18955a8cd086ccb92 Mon Sep 17 00:00:00 2001
From: luca abeni <luca.abeni@santannapisa.it>
Date: Thu, 7 Sep 2017 12:09:28 +0200
Subject: sched/headers: Remove duplicate prototype of __dl_clear_params()

Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1504778971-13573-2-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e83d1b8be611..9d5aa18fc9bf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -255,7 +255,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
-extern void __dl_clear_params(struct task_struct *p);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 extern int dl_task_can_attach(struct task_struct *p,
 			      const struct cpumask *cs_cpus_allowed);
-- 
cgit v1.3-14-g43fede


From 295d6d5e373607729bcc8182c25afe964655714f Mon Sep 17 00:00:00 2001
From: Luca Abeni <luca.abeni@santannapisa.it>
Date: Thu, 7 Sep 2017 12:09:29 +0200
Subject: sched/deadline: Fix switching to -deadline

Fix a bug introduced in:

  72f9f3fdc928 ("sched/deadline: Remove dl_new from struct sched_dl_entity")

After that commit, when switching to -deadline if the scheduling
deadline of a task is in the past then switched_to_dl() calls
setup_new_entity() to properly initialize the scheduling deadline
and runtime.

The problem is that the task is enqueued _before_ having its parameters
initialized by setup_new_entity(), and this can cause problems.
For example, a task with its out-of-date deadline in the past will
potentially be enqueued as the highest priority one; however, its
adjusted deadline may not be the earliest one.

This patch fixes the problem by initializing the task's parameters before
enqueuing it.

Signed-off-by: luca abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1504778971-13573-3-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0191ec7667c3..9d45354e4296 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1364,6 +1364,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
 		update_dl_entity(dl_se, pi_se);
 	} else if (flags & ENQUEUE_REPLENISH) {
 		replenish_dl_entity(dl_se, pi_se);
+	} else if ((flags & ENQUEUE_RESTORE) &&
+		  dl_time_before(dl_se->deadline,
+				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+		setup_new_dl_entity(dl_se);
 	}
 
 	__enqueue_dl_entity(dl_se);
@@ -2255,13 +2259,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 
 		return;
 	}
-	/*
-	 * If p is boosted we already updated its params in
-	 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
-	 * p's deadline being now already after rq_clock(rq).
-	 */
-	if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-		setup_new_dl_entity(&p->dl);
 
 	if (rq->curr != p) {
 #ifdef CONFIG_SMP
-- 
cgit v1.3-14-g43fede


From 8c0944cee7af55291df0b28e6e2eeac0930e93c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 7 Sep 2017 12:09:30 +0200
Subject: sched/deadline: Rename __dl_clear() to __dl_sub()

__dl_sub() is more meaningful as a name, and is more consistent
with the naming of the dual function (__dl_add()).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Luca Abeni <luca.abeni@santannapisa.it>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mathieu Poirier <mathieu.poirier@linaro.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1504778971-13573-4-git-send-email-luca.abeni@santannapisa.it
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/deadline.c | 10 +++++-----
 kernel/sched/sched.h    |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9d45354e4296..8d1b946fa684 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -242,7 +242,7 @@ static void task_non_contending(struct task_struct *p)
 			if (p->state == TASK_DEAD)
 				sub_rq_bw(p->dl.dl_bw, &rq->dl);
 			raw_spin_lock(&dl_b->lock);
-			__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+			__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 			__dl_clear_params(p);
 			raw_spin_unlock(&dl_b->lock);
 		}
@@ -1209,7 +1209,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 		}
 
 		raw_spin_lock(&dl_b->lock);
-		__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+		__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 		raw_spin_unlock(&dl_b->lock);
 		__dl_clear_params(p);
 
@@ -2170,7 +2170,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
 		 * until we complete the update.
 		 */
 		raw_spin_lock(&src_dl_b->lock);
-		__dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+		__dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 		raw_spin_unlock(&src_dl_b->lock);
 	}
 
@@ -2448,7 +2448,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
 	if (dl_policy(policy) && !task_has_dl_policy(p) &&
 	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
 		if (hrtimer_active(&p->dl.inactive_timer))
-			__dl_clear(dl_b, p->dl.dl_bw, cpus);
+			__dl_sub(dl_b, p->dl.dl_bw, cpus);
 		__dl_add(dl_b, new_bw, cpus);
 		err = 0;
 	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
@@ -2460,7 +2460,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
 		 * But this would require to set the task's "inactive
 		 * timer" when the task is not inactive.
 		 */
-		__dl_clear(dl_b, p->dl.dl_bw, cpus);
+		__dl_sub(dl_b, p->dl.dl_bw, cpus);
 		__dl_add(dl_b, new_bw, cpus);
 		dl_change_utilization(p, new_bw);
 		err = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9d5aa18fc9bf..a81c9782e98c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -226,7 +226,7 @@ struct dl_bw {
 static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
 
 static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
 {
 	dl_b->total_bw -= tsk_bw;
 	__dl_update(dl_b, (s32)tsk_bw / cpus);
-- 
cgit v1.3-14-g43fede


From ed4ad1ca08a53cf1a805478678d1e7ff0d2cf251 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 2 Oct 2017 14:50:33 +0200
Subject: sched/topology: Restore SD_PREFER_SIBLING on MC domains

The normal x86_topology on NHM+ machines degenerates because the MC
and CPU domains are of the same size, therefore MC inherits
SD_PREFER_SIBLING from CPU (which then gets taken out). The result is
that we'll spread tasks across the first NUMA level in order to
maximize cache utilization.

However, for the x86_numa_in_package_topology we loose the CPU domain,
and we'll not have SD_PREFER_SIBLING set anywhere, giving a distinct
difference in behaviour.

Commit:

  8e7fbcbc22c1 ("sched: Remove stale power aware scheduling remnants and dysfunctional knobs")

made a fail by not preserving the SD_PREFER_SIBLING for the !power_saving
case on both CPU and MC.

Then commit:

  6956dc568f34 ("sched/numa: Add SD_PERFER_SIBLING to CPU domain")

adds it back to the CPU but not MC.

Restore that now, such that we get consistent spreading behaviour wrt
L3 and NUMA.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1cf4f306a82..86e81f06d36b 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1157,6 +1157,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->smt_gain = 1178; /* ~15% */
 
 	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->flags |= SD_PREFER_SIBLING;
 		sd->imbalance_pct = 117;
 		sd->cache_nice_tries = 1;
 		sd->busy_idx = 2;
-- 
cgit v1.3-14-g43fede


From 051f3ca02e46432c0965e8948f00c07d8a2f09c0 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Thu, 7 Sep 2017 02:20:05 -0500
Subject: sched/topology: Introduce NUMA identity node sched domain

On AMD Family17h-based (EPYC) system, a logical NUMA node can contain
upto 8 cores (16 threads) with the following topology.

             ----------------------------
         C0  | T0 T1 |    ||    | T0 T1 | C4
             --------|    ||    |--------
         C1  | T0 T1 | L3 || L3 | T0 T1 | C5
             --------|    ||    |--------
         C2  | T0 T1 | #0 || #1 | T0 T1 | C6
             --------|    ||    |--------
         C3  | T0 T1 |    ||    | T0 T1 | C7
             ----------------------------

Here, there are 2 last-level (L3) caches per logical NUMA node.
A socket can contain upto 4 NUMA nodes, and a system can support
upto 2 sockets. With full system configuration, current scheduler
creates 4 sched domains:

  domain0 SMT       (span a core)
  domain1 MC        (span a last-level-cache)
  domain2 NUMA      (span a socket: 4 nodes)
  domain3 NUMA      (span a system: 8 nodes)

Note that there is no domain to represent cpus spaning a logical
NUMA node.  With this hierarchy of sched domains, the scheduler does
not balance properly in the following cases:

Case1:

 When running 8 tasks, a properly balanced system should
 schedule a task per logical NUMA node. This is not the case for
 the current scheduler.

Case2:

 In some cases, threads are scheduled on the same cpu, while other
 cpus are idle. This results in run-to-run inconsistency. For example:

  taskset -c 0-7 sysbench --num-threads=8 --test=cpu \
                          --cpu-max-prime=100000 run

Total execution time ranges from 25.1s to 33.5s depending on threads
placement, where 25.1s is when all 8 threads are balanced properly
on 8 cpus.

Introducing NUMA identity node sched domain, which is based on how
SRAT/SLIT table define a logical NUMA node. This results in the following
hierarchy of sched domains on the same system described above.

  domain0 SMT       (span a core)
  domain1 MC        (span a last-level-cache)
  domain2 NODE      (span a logical NUMA node)
  domain3 NUMA      (span a socket: 4 nodes)
  domain4 NUMA      (span a system: 8 nodes)

This fixes the improper load balancing cases mentioned above.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@suse.de
Link: http://lkml.kernel.org/r/1504768805-46716-1-git-send-email-suravee.suthikulpanit@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/topology.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 86e81f06d36b..f51d123f9fe1 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1332,6 +1332,10 @@ void sched_init_numa(void)
 	if (!sched_domains_numa_distance)
 		return;
 
+	/* Includes NUMA identity node at level 0. */
+	sched_domains_numa_distance[level++] = curr_distance;
+	sched_domains_numa_levels = level;
+
 	/*
 	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
 	 * unique distances in the node_distance() table.
@@ -1379,8 +1383,7 @@ void sched_init_numa(void)
 		return;
 
 	/*
-	 * 'level' contains the number of unique distances, excluding the
-	 * identity distance node_distance(i,i).
+	 * 'level' contains the number of unique distances
 	 *
 	 * The sched_domains_numa_distance[] array includes the actual distance
 	 * numbers.
@@ -1441,10 +1444,19 @@ void sched_init_numa(void)
 	for (i = 0; sched_domain_topology[i].mask; i++)
 		tl[i] = sched_domain_topology[i];
 
+	/*
+	 * Add the NUMA identity distance, aka single NODE.
+	 */
+	tl[i++] = (struct sched_domain_topology_level){
+		.mask = sd_numa_mask,
+		.numa_level = 0,
+		SD_INIT_NAME(NODE)
+	};
+
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
-	for (j = 0; j < level; i++, j++) {
+	for (j = 1; j < level; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
 			.mask = sd_numa_mask,
 			.sd_flags = cpu_numa_flags,
-- 
cgit v1.3-14-g43fede


From 93824900a2e242766f5fe6ae7697e3d7171aa234 Mon Sep 17 00:00:00 2001
From: Uladzislau Rezki <urezki@gmail.com>
Date: Wed, 13 Sep 2017 12:24:30 +0200
Subject: sched/fair: Search a task from the tail of the queue

As a first step this patch makes cfs_tasks list as MRU one.
It means, that when a next task is picked to run on physical
CPU it is moved to the front of the list.

Therefore, the cfs_tasks list is more or less sorted (except
woken tasks) starting from recently given CPU time tasks toward
tasks with max wait time in a run-queue, i.e. MRU list.

Second, as part of the load balance operation, this approach
starts detach_tasks()/detach_one_task() from the tail of the
queue instead of the head, giving some advantages:

 - tends to pick a task with highest wait time;

 - tasks located in the tail are less likely cache-hot,
   therefore the can_migrate_task() decision is higher.

hackbench illustrates slightly better performance. For example
doing 1000 samples and 40 groups on i5-3320M CPU, it shows below
figures:

 default: 0.657 avg
 patched: 0.646 avg

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sonymobile.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Link: http://lkml.kernel.org/r/20170913102430.8985-2-urezki@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac6602c5f36f..cc0bfb035df9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6628,10 +6628,7 @@ again:
 		set_next_entity(cfs_rq, se);
 	}
 
-	if (hrtick_enabled(rq))
-		hrtick_start_fair(rq, p);
-
-	return p;
+	goto done;
 simple:
 #endif
 
@@ -6645,6 +6642,16 @@ simple:
 
 	p = task_of(se);
 
+done: __maybe_unused
+#ifdef CONFIG_SMP
+	/*
+	 * Move the next running task to the front of
+	 * the list, so our cfs_tasks list becomes MRU
+	 * one.
+	 */
+	list_move(&p->se.group_node, &rq->cfs_tasks);
+#endif
+
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
@@ -7080,11 +7087,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
  */
 static struct task_struct *detach_one_task(struct lb_env *env)
 {
-	struct task_struct *p, *n;
+	struct task_struct *p;
 
 	lockdep_assert_held(&env->src_rq->lock);
 
-	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+	list_for_each_entry_reverse(p,
+			&env->src_rq->cfs_tasks, se.group_node) {
 		if (!can_migrate_task(p, env))
 			continue;
 
@@ -7130,7 +7138,7 @@ static int detach_tasks(struct lb_env *env)
 		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
 			break;
 
-		p = list_first_entry(tasks, struct task_struct, se.group_node);
+		p = list_last_entry(tasks, struct task_struct, se.group_node);
 
 		env->loop++;
 		/* We've more or less seen every task there is, call it quits */
@@ -7180,7 +7188,7 @@ static int detach_tasks(struct lb_env *env)
 
 		continue;
 next:
-		list_move_tail(&p->se.group_node, tasks);
+		list_move(&p->se.group_node, tasks);
 	}
 
 	/*
-- 
cgit v1.3-14-g43fede


From ea16f0ea6c3dc9e1aa083bd3d1792ba02860526e Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 11:55:51 +0100
Subject: sched/fair: Sync task util before slow-path wakeup

We use task_util() in find_idlest_group() via capacity_spare_wake().
This task_util() updated in wake_cap(). However wake_cap() is not the
only reason for ending up in find_idlest_group() - we could have been sent
there by wake_wide(). So explicitly sync the task util with prev_cpu
when we are about to head to find_idlest_group().

We could simply do this at the beginning of
select_task_rq_fair() (i.e. irrespective of whether we're heading to
select_idle_sibling() or find_idlest_group() & co), but I didn't want to
slow down the select_idle_sibling() path more than necessary.

Don't do this during fork balancing, we won't need the task_util and
we'd just clobber the last_update_time, which is supposed to be 0.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andres Oportus <andresoportus@google.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Joel Fernandes <joelaf@google.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/20170808095519.10077-1-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cc0bfb035df9..c04a42556a59 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6263,8 +6263,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 			new_cpu = cpu;
 	}
 
+	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+		/*
+		 * We're going to need the task's util for capacity_spare_wake
+		 * in find_idlest_group. Sync it up to prev_cpu's
+		 * last_update_time.
+		 */
+		sync_entity_load_avg(&p->se);
+	}
+
 	if (!sd) {
- pick_cpu:
+pick_cpu:
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-- 
cgit v1.3-14-g43fede


From 583ffd99d7657755736d831bbc182612d1d2697d Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 11:58:54 +0100
Subject: sched/fair: Force balancing on NOHZ balance if local group has
 capacity

The "goto force_balance" here is intended to mitigate the fact that
avg_load calculations can result in bad placement decisions when
priority is asymmetrical.

The original commit that adds it:

  fab476228ba3 ("sched: Force balancing on newidle balance if local group has capacity")

explains:

    Under certain situations, such as a niced down task (i.e. nice =
    -15) in the presence of nr_cpus NICE0 tasks, the niced task lands
    on a sched group and kicks away other tasks because of its large
    weight. This leads to sub-optimal utilization of the
    machine. Even though the sched group has capacity, it does not
    pull tasks because sds.this_load >> sds.max_load, and f_b_g()
    returns NULL.

A similar but inverted issue also affects ARM big.LITTLE (asymmetrical CPU
capacity) systems - consider 8 always-running, same-priority tasks on a
system with 4 "big" and 4 "little" CPUs. Suppose that 5 of them end up on
the "big" CPUs (which will be represented by one sched_group in the DIE
sched_domain) and 3 on the "little" (the other sched_group in DIE), leaving
one CPU unused. Because the "big" group has a higher group_capacity its
avg_load may not present an imbalance that would cause migrating a
task to the idle "little".

The force_balance case here solves the problem but currently only for
CPU_NEWLY_IDLE balances, which in theory might never happen on the
unused CPU. Including CPU_IDLE in the force_balance case means
there's an upper bound on the time before we can attempt to solve the
underutilization: after DIE's sd->balance_interval has passed the
next nohz balance kick will help us out.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Paul Turner <pjt@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170807163900.25180-1-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c04a42556a59..cf3e816db80f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8186,8 +8186,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
 
-	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+	/*
+	 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+	 * capacities from resulting in underutilization due to avg_load.
+	 */
+	if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
 	    busiest->group_no_capacity)
 		goto force_balance;
 
-- 
cgit v1.3-14-g43fede


From 18bd1b4bd53aba81d76d55e91a68310a227dc187 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:12 +0100
Subject: sched/fair: Move select_task_rq_fair() slow-path into its own
 function

In preparation for changes that would otherwise require adding a new
level of indentation to the while(sd) loop, create a new function
find_idlest_cpu() which contains this loop, and rename the existing
find_idlest_cpu() to find_idlest_group_cpu().

Code inside the while(sd) loop is unchanged. @new_cpu is added as a
variable in the new function, with the same initial value as the
@new_cpu in select_task_rq_fair().

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-2-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 83 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 48 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cf3e816db80f..dd4f25305604 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5859,10 +5859,10 @@ skip_spare:
 }
 
 /*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
 	unsigned long load, min_load = ULONG_MAX;
 	unsigned int min_exit_latency = UINT_MAX;
@@ -5911,6 +5911,50 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
 }
 
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+				  int cpu, int prev_cpu, int sd_flag)
+{
+	int new_cpu = prev_cpu;
+
+	while (sd) {
+		struct sched_group *group;
+		struct sched_domain *tmp;
+		int weight;
+
+		if (!(sd->flags & sd_flag)) {
+			sd = sd->child;
+			continue;
+		}
+
+		group = find_idlest_group(sd, p, cpu, sd_flag);
+		if (!group) {
+			sd = sd->child;
+			continue;
+		}
+
+		new_cpu = find_idlest_group_cpu(group, p, cpu);
+		if (new_cpu == -1 || new_cpu == cpu) {
+			/* Now try balancing at a lower domain level of cpu */
+			sd = sd->child;
+			continue;
+		}
+
+		/* Now try balancing at a lower domain level of new_cpu */
+		cpu = new_cpu;
+		weight = sd->span_weight;
+		sd = NULL;
+		for_each_domain(cpu, tmp) {
+			if (weight <= tmp->span_weight)
+				break;
+			if (tmp->flags & sd_flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
+	}
+
+	return new_cpu;
+}
+
 #ifdef CONFIG_SCHED_SMT
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6277,39 +6321,8 @@ pick_cpu:
 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-	} else while (sd) {
-		struct sched_group *group;
-		int weight;
-
-		if (!(sd->flags & sd_flag)) {
-			sd = sd->child;
-			continue;
-		}
-
-		group = find_idlest_group(sd, p, cpu, sd_flag);
-		if (!group) {
-			sd = sd->child;
-			continue;
-		}
-
-		new_cpu = find_idlest_cpu(group, p, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
-			/* Now try balancing at a lower domain level of cpu */
-			sd = sd->child;
-			continue;
-		}
-
-		/* Now try balancing at a lower domain level of new_cpu */
-		cpu = new_cpu;
-		weight = sd->span_weight;
-		sd = NULL;
-		for_each_domain(cpu, tmp) {
-			if (weight <= tmp->span_weight)
-				break;
-			if (tmp->flags & sd_flag)
-				sd = tmp;
-		}
-		/* while loop will break here if sd == NULL */
+	} else {
+		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 	}
 	rcu_read_unlock();
 
-- 
cgit v1.3-14-g43fede


From e90381eaecf6d59c60fe396838e0e99789531419 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:13 +0100
Subject: sched/fair: Remove unnecessary comparison with -1

Since commit:

  83a0a96a5f26 ("sched/fair: Leverage the idle state info when choosing the "idlest" cpu")

find_idlest_group_cpu() (formerly find_idlest_cpu) no longer returns -1,
so we can simplify the checking of the return value in find_idlest_cpu().

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-3-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd4f25305604..bdd28fbad7a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5933,7 +5933,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 		}
 
 		new_cpu = find_idlest_group_cpu(group, p, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
+		if (new_cpu == cpu) {
 			/* Now try balancing at a lower domain level of cpu */
 			sd = sd->child;
 			continue;
-- 
cgit v1.3-14-g43fede


From 0d10ab952e99f3e9f374898e93f45452b81e5711 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:14 +0100
Subject: sched/fair: Fix find_idlest_group() when local group is not allowed

When the local group is not allowed we do not modify this_*_load from
their initial value of 0. That means that the load checks at the end
of find_idlest_group cause us to incorrectly return NULL. Fixing the
initial values to ULONG_MAX means we will instead return the idlest
remote group in that case.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-4-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bdd28fbad7a3..cca1835efa7b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5737,8 +5737,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
 	struct sched_group *most_spare_sg = NULL;
-	unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
-	unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+	unsigned long min_runnable_load = ULONG_MAX;
+	unsigned long this_runnable_load = ULONG_MAX;
+	unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
 	unsigned long most_spare = 0, this_spare = 0;
 	int load_idx = sd->forkexec_idx;
 	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
-- 
cgit v1.3-14-g43fede


From 6fee85ccbc76e8aeba43dc120c5fa3c5409a4e2c Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:15 +0100
Subject: sched/fair: Fix usage of find_idlest_group() when no groups are
 allowed

When 'p' is not allowed on any of the CPUs in the sched_domain, we
currently return NULL from find_idlest_group(), and pointlessly
continue the search on lower sched_domain levels (where 'p' is also not
allowed) before returning prev_cpu regardless (as we have not updated
new_cpu).

Add an explicit check for this case, and add a comment to
find_idlest_group(). Now when find_idlest_group() returns NULL, it always
means that the local group is allowed and idlest.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-5-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cca1835efa7b..ed80d6bd76c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5730,6 +5730,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
  */
 static struct sched_group *
 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -5917,6 +5919,9 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 {
 	int new_cpu = prev_cpu;
 
+	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+		return prev_cpu;
+
 	while (sd) {
 		struct sched_group *group;
 		struct sched_domain *tmp;
-- 
cgit v1.3-14-g43fede


From 93f50f90247e3e926bbe9830df089c64a5cec236 Mon Sep 17 00:00:00 2001
From: Brendan Jackman <brendan.jackman@arm.com>
Date: Thu, 5 Oct 2017 12:45:16 +0100
Subject: sched/fair: Fix usage of find_idlest_group() when the local group is
 idlest

find_idlest_group() returns NULL when the local group is idlest. The
caller then continues the find_idlest_group() search at a lower level
of the current CPU's sched_domain hierarchy. find_idlest_group_cpu() is
not consulted and, crucially, @new_cpu is not updated. This means the
search is pointless and we return @prev_cpu from select_task_rq_fair().

This is fixed by initialising @new_cpu to @cpu instead of @prev_cpu.

Signed-off-by: Brendan Jackman <brendan.jackman@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Josef Bacik <josef@toxicpanda.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171005114516.18617-6-brendan.jackman@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ed80d6bd76c8..56f343b8e749 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5917,7 +5917,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
 				  int cpu, int prev_cpu, int sd_flag)
 {
-	int new_cpu = prev_cpu;
+	int new_cpu = cpu;
 
 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
 		return prev_cpu;
-- 
cgit v1.3-14-g43fede


From 4bdced5c9a2922521e325896a7bbbf0132c94e56 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
Date: Fri, 6 Oct 2017 14:05:04 -0400
Subject: sched/rt: Simplify the IPI based RT balancing logic

When a CPU lowers its priority (schedules out a high priority task for a
lower priority one), a check is made to see if any other CPU has overloaded
RT tasks (more than one). It checks the rto_mask to determine this and if so
it will request to pull one of those tasks to itself if the non running RT
task is of higher priority than the new priority of the next task to run on
the current CPU.

When we deal with large number of CPUs, the original pull logic suffered
from large lock contention on a single CPU run queue, which caused a huge
latency across all CPUs. This was caused by only having one CPU having
overloaded RT tasks and a bunch of other CPUs lowering their priority. To
solve this issue, commit:

  b6366f048e0c ("sched/rt: Use IPI to trigger RT task push migration instead of pulling")

changed the way to request a pull. Instead of grabbing the lock of the
overloaded CPU's runqueue, it simply sent an IPI to that CPU to do the work.

Although the IPI logic worked very well in removing the large latency build
up, it still could suffer from a large number of IPIs being sent to a single
CPU. On a 80 CPU box, I measured over 200us of processing IPIs. Worse yet,
when I tested this on a 120 CPU box, with a stress test that had lots of
RT tasks scheduling on all CPUs, it actually triggered the hard lockup
detector! One CPU had so many IPIs sent to it, and due to the restart
mechanism that is triggered when the source run queue has a priority status
change, the CPU spent minutes! processing the IPIs.

Thinking about this further, I realized there's no reason for each run queue
to send its own IPI. As all CPUs with overloaded tasks must be scanned
regardless if there's one or many CPUs lowering their priority, because
there's no current way to find the CPU with the highest priority task that
can schedule to one of these CPUs, there really only needs to be one IPI
being sent around at a time.

This greatly simplifies the code!

The new approach is to have each root domain have its own irq work, as the
rto_mask is per root domain. The root domain has the following fields
attached to it:

  rto_push_work	 - the irq work to process each CPU set in rto_mask
  rto_lock	 - the lock to protect some of the other rto fields
  rto_loop_start - an atomic that keeps contention down on rto_lock
		    the first CPU scheduling in a lower priority task
		    is the one to kick off the process.
  rto_loop_next	 - an atomic that gets incremented for each CPU that
		    schedules in a lower priority task.
  rto_loop	 - a variable protected by rto_lock that is used to
		    compare against rto_loop_next
  rto_cpu	 - The cpu to send the next IPI to, also protected by
		    the rto_lock.

When a CPU schedules in a lower priority task and wants to make sure
overloaded CPUs know about it. It increments the rto_loop_next. Then it
atomically sets rto_loop_start with a cmpxchg. If the old value is not "0",
then it is done, as another CPU is kicking off the IPI loop. If the old
value is "0", then it will take the rto_lock to synchronize with a possible
IPI being sent around to the overloaded CPUs.

If rto_cpu is greater than or equal to nr_cpu_ids, then there's either no
IPI being sent around, or one is about to finish. Then rto_cpu is set to the
first CPU in rto_mask and an IPI is sent to that CPU. If there's no CPUs set
in rto_mask, then there's nothing to be done.

When the CPU receives the IPI, it will first try to push any RT tasks that is
queued on the CPU but can't run because a higher priority RT task is
currently running on that CPU.

Then it takes the rto_lock and looks for the next CPU in the rto_mask. If it
finds one, it simply sends an IPI to that CPU and the process continues.

If there's no more CPUs in the rto_mask, then rto_loop is compared with
rto_loop_next. If they match, everything is done and the process is over. If
they do not match, then a CPU scheduled in a lower priority task as the IPI
was being passed around, and the process needs to start again. The first CPU
in rto_mask is sent the IPI.

This change removes this duplication of work in the IPI logic, and greatly
lowers the latency caused by the IPIs. This removed the lockup happening on
the 120 CPU machine. It also simplifies the code tremendously. What else
could anyone ask for?

Thanks to Peter Zijlstra for simplifying the rto_loop_start atomic logic and
supplying me with the rto_start_trylock() and rto_start_unlock() helper
functions.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Clark Williams <williams@redhat.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Wood <swood@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20170424114732.1aac6dc4@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/rt.c       | 316 ++++++++++++++++++------------------------------
 kernel/sched/sched.h    |  24 ++--
 kernel/sched/topology.c |   6 +
 3 files changed, 138 insertions(+), 208 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0af5ca9e3e3f..fda27991699a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -73,10 +73,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
 void init_rt_rq(struct rt_rq *rt_rq)
 {
 	struct rt_prio_array *array;
@@ -96,13 +92,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
 	plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
-	rt_rq->push_flags = 0;
-	rt_rq->push_cpu = nr_cpu_ids;
-	raw_spin_lock_init(&rt_rq->push_lock);
-	init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
 #endif /* CONFIG_SMP */
 	/* We start is dequeued state, because no RT tasks are queued */
 	rt_rq->rt_queued = 0;
@@ -1875,241 +1864,166 @@ static void push_rt_tasks(struct rq *rq)
 }
 
 #ifdef HAVE_RT_PUSH_IPI
+
 /*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has shown to cause large latency when done on machines with many
+ * CPUs, sending an IPI to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if its the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tassk must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
  *
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
  */
 static int rto_next_cpu(struct rq *rq)
 {
-	int prev_cpu = rq->rt.push_cpu;
+	struct root_domain *rd = rq->rd;
+	int next;
 	int cpu;
 
-	cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
 	/*
-	 * If the previous cpu is less than the rq's CPU, then it already
-	 * passed the end of the mask, and has started from the beginning.
-	 * We end if the next CPU is greater or equal to rq's CPU.
+	 * When starting the IPI RT pushing, the rto_cpu is set to -1,
+	 * rt_next_cpu() will simply return the first CPU found in
+	 * the rto_mask.
+	 *
+	 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+	 * will return the next CPU found in the rto_mask.
+	 *
+	 * If there are no more CPUs left in the rto_mask, then a check is made
+	 * against rto_loop and rto_loop_next. rto_loop is only updated with
+	 * the rto_lock held, but any CPU may increment the rto_loop_next
+	 * without any locking.
 	 */
-	if (prev_cpu < rq->cpu) {
-		if (cpu >= rq->cpu)
-			return nr_cpu_ids;
+	for (;;) {
 
-	} else if (cpu >= nr_cpu_ids) {
-		/*
-		 * We passed the end of the mask, start at the beginning.
-		 * If the result is greater or equal to the rq's CPU, then
-		 * the loop is finished.
-		 */
-		cpu = cpumask_first(rq->rd->rto_mask);
-		if (cpu >= rq->cpu)
-			return nr_cpu_ids;
-	}
-	rq->rt.push_cpu = cpu;
+		/* When rto_cpu is -1 this acts like cpumask_first() */
+		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
 
-	/* Return cpu to let the caller know if the loop is finished or not */
-	return cpu;
-}
+		rd->rto_cpu = cpu;
 
-static int find_next_push_cpu(struct rq *rq)
-{
-	struct rq *next_rq;
-	int cpu;
+		if (cpu < nr_cpu_ids)
+			return cpu;
 
-	while (1) {
-		cpu = rto_next_cpu(rq);
-		if (cpu >= nr_cpu_ids)
-			break;
-		next_rq = cpu_rq(cpu);
+		rd->rto_cpu = -1;
+
+		/*
+		 * ACQUIRE ensures we see the @rto_mask changes
+		 * made prior to the @next value observed.
+		 *
+		 * Matches WMB in rt_set_overload().
+		 */
+		next = atomic_read_acquire(&rd->rto_loop_next);
 
-		/* Make sure the next rq can push to this rq */
-		if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+		if (rd->rto_loop == next)
 			break;
+
+		rd->rto_loop = next;
 	}
 
-	return cpu;
+	return -1;
 }
 
-#define RT_PUSH_IPI_EXECUTING		1
-#define RT_PUSH_IPI_RESTART		2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+	return !atomic_cmpxchg_acquire(v, 0, 1);
+}
 
-/*
- * When a high priority task schedules out from a CPU and a lower priority
- * task is scheduled in, a check is made to see if there's any RT tasks
- * on other CPUs that are waiting to run because a higher priority RT task
- * is currently running on its CPU. In this case, the CPU with multiple RT
- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- * up that may be able to run one of its non-running queued RT tasks.
- *
- * On large CPU boxes, there's the case that several CPUs could schedule
- * a lower priority task at the same time, in which case it will look for
- * any overloaded CPUs that it could pull a task from. To do this, the runqueue
- * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
- * for a single overloaded CPU's runqueue lock can produce a large latency.
- * (This has actually been observed on large boxes running cyclictest).
- * Instead of taking the runqueue lock of the overloaded CPU, each of the
- * CPUs that scheduled a lower priority task simply sends an IPI to the
- * overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
- * lots of contention. The overloaded CPU will look to push its non-running
- * RT task off, and if it does, it can then ignore the other IPIs coming
- * in, and just pass those IPIs off to any other overloaded CPU.
- *
- * When a CPU schedules a lower priority task, it only sends an IPI to
- * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
- * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
- * RT overloaded tasks, would cause 100 IPIs to go out at once.
- *
- * The overloaded RT CPU, when receiving an IPI, will try to push off its
- * overloaded RT tasks and then send an IPI to the next CPU that has
- * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
- * have completed. Just because a CPU may have pushed off its own overloaded
- * RT task does not mean it should stop sending the IPI around to other
- * overloaded CPUs. There may be another RT task waiting to run on one of
- * those CPUs that are of higher priority than the one that was just
- * pushed.
- *
- * An optimization that could possibly be made is to make a CPU array similar
- * to the cpupri array mask of all running RT tasks, but for the overloaded
- * case, then the IPI could be sent to only the CPU with the highest priority
- * RT task waiting, and that CPU could send off further IPIs to the CPU with
- * the next highest waiting task. Since the overloaded case is much less likely
- * to happen, the complexity of this implementation may not be worth it.
- * Instead, just send an IPI around to all overloaded CPUs.
- *
- * The rq->rt.push_flags holds the status of the IPI that is going around.
- * A run queue can only send out a single IPI at a time. The possible flags
- * for rq->rt.push_flags are:
- *
- *    (None or zero):		No IPI is going around for the current rq
- *    RT_PUSH_IPI_EXECUTING:	An IPI for the rq is being passed around
- *    RT_PUSH_IPI_RESTART:	The priority of the running task for the rq
- *				has changed, and the IPI should restart
- *				circulating the overloaded CPUs again.
- *
- * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
- * before sending to the next CPU.
- *
- * Instead of having all CPUs that schedule a lower priority task send
- * an IPI to the same "first" CPU in the RT overload mask, they send it
- * to the next overloaded CPU after their own CPU. This helps distribute
- * the work when there's more than one overloaded CPU and multiple CPUs
- * scheduling in lower priority tasks.
- *
- * When a rq schedules a lower priority task than what was currently
- * running, the next CPU with overloaded RT tasks is examined first.
- * That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
- * priority task, it will send an IPI first to CPU 5, then CPU 5 will
- * send to CPU 1 if it is still overloaded. CPU 1 will clear the
- * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
- *
- * The first CPU to notice IPI_RESTART is set, will clear that flag and then
- * send an IPI to the next overloaded CPU after the rq->cpu and not the next
- * CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
- * schedules a lower priority task, and the IPI_RESTART gets set while the
- * handling is being done on CPU 5, it will clear the flag and send it back to
- * CPU 4 instead of CPU 1.
- *
- * Note, the above logic can be disabled by turning off the sched_feature
- * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
- * taken by the CPU requesting a pull and the waiting RT task will be pulled
- * by that CPU. This may be fine for machines with few CPUs.
- */
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
 {
-	int cpu;
+	atomic_set_release(v, 0);
+}
 
-	if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
-		raw_spin_lock(&rq->rt.push_lock);
-		/* Make sure it's still executing */
-		if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
-			/*
-			 * Tell the IPI to restart the loop as things have
-			 * changed since it started.
-			 */
-			rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
-			raw_spin_unlock(&rq->rt.push_lock);
-			return;
-		}
-		raw_spin_unlock(&rq->rt.push_lock);
-	}
+static void tell_cpu_to_push(struct rq *rq)
+{
+	int cpu = -1;
 
-	/* When here, there's no IPI going around */
+	/* Keep the loop going if the IPI is currently active */
+	atomic_inc(&rq->rd->rto_loop_next);
 
-	rq->rt.push_cpu = rq->cpu;
-	cpu = find_next_push_cpu(rq);
-	if (cpu >= nr_cpu_ids)
+	/* Only one CPU can initiate a loop at a time */
+	if (!rto_start_trylock(&rq->rd->rto_loop_start))
 		return;
 
-	rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+	raw_spin_lock(&rq->rd->rto_lock);
+
+	/*
+	 * The rto_cpu is updated under the lock, if it has a valid cpu
+	 * then the IPI is still running and will continue due to the
+	 * update to loop_next, and nothing needs to be done here.
+	 * Otherwise it is finishing up and an ipi needs to be sent.
+	 */
+	if (rq->rd->rto_cpu < 0)
+		cpu = rto_next_cpu(rq);
 
-	irq_work_queue_on(&rq->rt.push_work, cpu);
+	raw_spin_unlock(&rq->rd->rto_lock);
+
+	rto_start_unlock(&rq->rd->rto_loop_start);
+
+	if (cpu >= 0)
+		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }
 
 /* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
 {
-	struct rt_rq *rt_rq = arg;
-	struct rq *rq, *src_rq;
-	int this_cpu;
+	struct rq *rq;
 	int cpu;
 
-	this_cpu = rt_rq->push_cpu;
+	rq = this_rq();
 
-	/* Paranoid check */
-	BUG_ON(this_cpu != smp_processor_id());
-
-	rq = cpu_rq(this_cpu);
-	src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+	/*
+	 * We do not need to grab the lock to check for has_pushable_tasks.
+	 * When it gets updated, a check is made if a push is possible.
+	 */
 	if (has_pushable_tasks(rq)) {
 		raw_spin_lock(&rq->lock);
-		push_rt_task(rq);
+		push_rt_tasks(rq);
 		raw_spin_unlock(&rq->lock);
 	}
 
-	/* Pass the IPI to the next rt overloaded queue */
-	raw_spin_lock(&rt_rq->push_lock);
-	/*
-	 * If the source queue changed since the IPI went out,
-	 * we need to restart the search from that CPU again.
-	 */
-	if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
-		rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
-		rt_rq->push_cpu = src_rq->cpu;
-	}
+	raw_spin_lock(&rq->rd->rto_lock);
 
-	cpu = find_next_push_cpu(src_rq);
+	/* Pass the IPI to the next rt overloaded queue */
+	cpu = rto_next_cpu(rq);
 
-	if (cpu >= nr_cpu_ids)
-		rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
-	raw_spin_unlock(&rt_rq->push_lock);
+	raw_spin_unlock(&rq->rd->rto_lock);
 
-	if (cpu >= nr_cpu_ids)
+	if (cpu < 0)
 		return;
 
-	/*
-	 * It is possible that a restart caused this CPU to be
-	 * chosen again. Don't bother with an IPI, just see if we
-	 * have more to push.
-	 */
-	if (unlikely(cpu == rq->cpu))
-		goto again;
-
 	/* Try the next RT overloaded CPU */
-	irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
-	struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
-	try_to_push_tasks(rt_rq);
+	irq_work_queue_on(&rq->rd->rto_push_work, cpu);
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a81c9782e98c..8aa24b41f652 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -505,7 +505,7 @@ static inline int rt_bandwidth_enabled(void)
 }
 
 /* RT IPI pull logic requires IRQ_WORK */
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
 # define HAVE_RT_PUSH_IPI
 #endif
 
@@ -527,12 +527,6 @@ struct rt_rq {
 	unsigned long rt_nr_total;
 	int overloaded;
 	struct plist_head pushable_tasks;
-#ifdef HAVE_RT_PUSH_IPI
-	int push_flags;
-	int push_cpu;
-	struct irq_work push_work;
-	raw_spinlock_t push_lock;
-#endif
 #endif /* CONFIG_SMP */
 	int rt_queued;
 
@@ -641,6 +635,19 @@ struct root_domain {
 	struct dl_bw dl_bw;
 	struct cpudl cpudl;
 
+#ifdef HAVE_RT_PUSH_IPI
+	/*
+	 * For IPI pull requests, loop across the rto_mask.
+	 */
+	struct irq_work rto_push_work;
+	raw_spinlock_t rto_lock;
+	/* These are only updated and read within rto_lock */
+	int rto_loop;
+	int rto_cpu;
+	/* These atomics are updated outside of a lock */
+	atomic_t rto_loop_next;
+	atomic_t rto_loop_start;
+#endif
 	/*
 	 * The "RT overload" flag: it gets set if a CPU has more than
 	 * one runnable RT task.
@@ -658,6 +665,9 @@ extern void init_defrootdomain(void);
 extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f51d123f9fe1..e50450c2fed8 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -268,6 +268,12 @@ static int init_rootdomain(struct root_domain *rd)
 	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_dlo_mask;
 
+#ifdef HAVE_RT_PUSH_IPI
+	rd->rto_cpu = -1;
+	raw_spin_lock_init(&rd->rto_lock);
+	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
 	init_dl_bw(&rd->dl_bw);
 	if (cpudl_init(&rd->cpudl) != 0)
 		goto free_rto_mask;
-- 
cgit v1.3-14-g43fede


From 76f8507f7a6442215df19de74f07eabca2462f1e Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 29 Sep 2017 19:06:38 +0300
Subject: locking/rwsem: Add down_read_killable()

Similar to down_read() and down_write_killable(),
add killable version of down_read(), based on
__down_read_killable() function, added in previous
patches.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arnd@arndb.de
Cc: avagin@virtuozzo.com
Cc: davem@davemloft.net
Cc: fenghua.yu@intel.com
Cc: gorcunov@virtuozzo.com
Cc: heiko.carstens@de.ibm.com
Cc: hpa@zytor.com
Cc: ink@jurassic.park.msu.ru
Cc: mattst88@gmail.com
Cc: rientjes@google.com
Cc: rth@twiddle.net
Cc: schwidefsky@de.ibm.com
Cc: tony.luck@intel.com
Cc: viro@zeniv.linux.org.uk
Link: http://lkml.kernel.org/r/150670119884.23930.2585570605960763239.stgit@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/rwsem.h | 10 ++++++++++
 include/linux/rwsem.h       |  1 +
 kernel/locking/rwsem.c      | 16 ++++++++++++++++
 3 files changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/include/asm-generic/rwsem.h b/include/asm-generic/rwsem.h
index 6c6a2141f271..b2d68d2d198e 100644
--- a/include/asm-generic/rwsem.h
+++ b/include/asm-generic/rwsem.h
@@ -37,6 +37,16 @@ static inline void __down_read(struct rw_semaphore *sem)
 		rwsem_down_read_failed(sem);
 }
 
+static inline int __down_read_killable(struct rw_semaphore *sem)
+{
+	if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
+		if (IS_ERR(rwsem_down_read_failed_killable(sem)))
+			return -EINTR;
+	}
+
+	return 0;
+}
+
 static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
 	long tmp;
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 0ad7318ff299..6ac8ee5f15dd 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -111,6 +111,7 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem)
  * lock for reading
  */
 extern void down_read(struct rw_semaphore *sem);
+extern int __must_check down_read_killable(struct rw_semaphore *sem);
 
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 4d48b1c4870d..e53f7746d9fd 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -28,6 +28,22 @@ void __sched down_read(struct rw_semaphore *sem)
 
 EXPORT_SYMBOL(down_read);
 
+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+	might_sleep();
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+	if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+		rwsem_release(&sem->dep_map, 1, _RET_IP_);
+		return -EINTR;
+	}
+
+	rwsem_set_reader_owned(sem);
+	return 0;
+}
+
+EXPORT_SYMBOL(down_read_killable);
+
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-- 
cgit v1.3-14-g43fede


From a8a217c22116eff6c120d753c9934089fb229af0 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 3 Oct 2017 19:25:27 +0100
Subject: locking/core: Remove {read,spin,write}_can_lock()

Outside of the locking code itself, {read,spin,write}_can_lock() have no
users in tree. Apparmor (the last remaining user of write_can_lock()) got
moved over to lockdep by the previous patch.

This patch removes the use of {read,spin,write}_can_lock() from the
BUILD_LOCK_OPS macro, deferring to the trylock operation for testing the
lock status, and subsequently removes the unused macros altogether. They
aren't guaranteed to work in a concurrent environment and can give
incorrect results in the case of qrwlock.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/1507055129-12300-2-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/alpha/include/asm/spinlock.h        | 10 ----------
 arch/arc/include/asm/spinlock.h          |  3 ---
 arch/arm/include/asm/spinlock.h          |  6 ------
 arch/blackfin/include/asm/spinlock.h     | 10 ----------
 arch/hexagon/include/asm/spinlock.h      | 10 ----------
 arch/ia64/include/asm/spinlock.h         |  3 ---
 arch/m32r/include/asm/spinlock.h         | 12 ------------
 arch/metag/include/asm/spinlock_lnkget.h | 30 ------------------------------
 arch/metag/include/asm/spinlock_lock1.h  | 20 --------------------
 arch/mn10300/include/asm/spinlock.h      | 12 ------------
 arch/parisc/include/asm/spinlock.h       | 18 ------------------
 arch/powerpc/include/asm/spinlock.h      |  3 ---
 arch/s390/include/asm/spinlock.h         | 12 ------------
 arch/sh/include/asm/spinlock-cas.h       | 12 ------------
 arch/sh/include/asm/spinlock-llsc.h      | 12 ------------
 arch/sparc/include/asm/spinlock_32.h     |  3 ---
 arch/tile/include/asm/spinlock_32.h      | 16 ----------------
 arch/tile/include/asm/spinlock_64.h      | 18 ------------------
 arch/xtensa/include/asm/spinlock.h       |  2 --
 include/asm-generic/qrwlock.h            | 20 --------------------
 include/linux/rwlock.h                   |  3 ---
 include/linux/spinlock.h                 | 11 -----------
 include/linux/spinlock_up.h              |  3 ---
 kernel/locking/spinlock.c                |  6 ++----
 24 files changed, 2 insertions(+), 253 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h
index 718ac0b64adf..7bff6316b8bb 100644
--- a/arch/alpha/include/asm/spinlock.h
+++ b/arch/alpha/include/asm/spinlock.h
@@ -54,16 +54,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 
 /***********************************************************/
 
-static inline int arch_read_can_lock(arch_rwlock_t *lock)
-{
-	return (lock->lock & 1) == 0;
-}
-
-static inline int arch_write_can_lock(arch_rwlock_t *lock)
-{
-	return lock->lock == 0;
-}
-
 static inline void arch_read_lock(arch_rwlock_t *lock)
 {
 	long regx;
diff --git a/arch/arc/include/asm/spinlock.h b/arch/arc/include/asm/spinlock.h
index 47efc8451b70..ce9bfcf1d870 100644
--- a/arch/arc/include/asm/spinlock.h
+++ b/arch/arc/include/asm/spinlock.h
@@ -410,9 +410,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 
 #endif
 
-#define arch_read_can_lock(x)	((x)->counter > 0)
-#define arch_write_can_lock(x)	((x)->counter == __ARCH_RW_LOCK_UNLOCKED__)
-
 #define arch_read_lock_flags(lock, flags)	arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags)	arch_write_lock(lock)
 
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index c030143c18c6..f5223260c73e 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -193,9 +193,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 	dsb_sev();
 }
 
-/* write_can_lock - would write_trylock() succeed? */
-#define arch_write_can_lock(x)		(ACCESS_ONCE((x)->lock) == 0)
-
 /*
  * Read locks are a bit more hairy:
  *  - Exclusively load the lock value.
@@ -273,9 +270,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 	}
 }
 
-/* read_can_lock - would read_trylock() succeed? */
-#define arch_read_can_lock(x)		(ACCESS_ONCE((x)->lock) < 0x80000000)
-
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
diff --git a/arch/blackfin/include/asm/spinlock.h b/arch/blackfin/include/asm/spinlock.h
index f6431439d15d..607ef98c0f6c 100644
--- a/arch/blackfin/include/asm/spinlock.h
+++ b/arch/blackfin/include/asm/spinlock.h
@@ -48,16 +48,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 	__raw_spin_unlock_asm(&lock->lock);
 }
 
-static inline int arch_read_can_lock(arch_rwlock_t *rw)
-{
-	return __raw_uncached_fetch_asm(&rw->lock) > 0;
-}
-
-static inline int arch_write_can_lock(arch_rwlock_t *rw)
-{
-	return __raw_uncached_fetch_asm(&rw->lock) == RW_LOCK_BIAS;
-}
-
 static inline void arch_read_lock(arch_rwlock_t *rw)
 {
 	__raw_read_lock_asm(&rw->lock);
diff --git a/arch/hexagon/include/asm/spinlock.h b/arch/hexagon/include/asm/spinlock.h
index 53a8d5885887..9f9414b9c303 100644
--- a/arch/hexagon/include/asm/spinlock.h
+++ b/arch/hexagon/include/asm/spinlock.h
@@ -86,16 +86,6 @@ static inline int arch_read_trylock(arch_rwlock_t *lock)
 	return temp;
 }
 
-static inline int arch_read_can_lock(arch_rwlock_t *rwlock)
-{
-	return rwlock->lock == 0;
-}
-
-static inline int arch_write_can_lock(arch_rwlock_t *rwlock)
-{
-	return rwlock->lock == 0;
-}
-
 /*  Stuffs a -1 in the lock value?  */
 static inline void arch_write_lock(arch_rwlock_t *lock)
 {
diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
index df2c121164b8..c728dda59bd2 100644
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -127,9 +127,6 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
 	arch_spin_lock(lock);
 }
 
-#define arch_read_can_lock(rw)		(*(volatile int *)(rw) >= 0)
-#define arch_write_can_lock(rw)	(*(volatile int *)(rw) == 0)
-
 #ifdef ASM_SUPPORTED
 
 static __always_inline void
diff --git a/arch/m32r/include/asm/spinlock.h b/arch/m32r/include/asm/spinlock.h
index a56825592b90..002601371b2f 100644
--- a/arch/m32r/include/asm/spinlock.h
+++ b/arch/m32r/include/asm/spinlock.h
@@ -137,18 +137,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
  * semaphore.h for details.  -ben
  */
 
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_read_can_lock(x) ((int)(x)->lock > 0)
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
-
 static inline void arch_read_lock(arch_rwlock_t *rw)
 {
 	unsigned long tmp0, tmp1;
diff --git a/arch/metag/include/asm/spinlock_lnkget.h b/arch/metag/include/asm/spinlock_lnkget.h
index ad8436feed8d..6a932a952d53 100644
--- a/arch/metag/include/asm/spinlock_lnkget.h
+++ b/arch/metag/include/asm/spinlock_lnkget.h
@@ -136,21 +136,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 		      : "memory");
 }
 
-/* write_can_lock - would write_trylock() succeed? */
-static inline int arch_write_can_lock(arch_rwlock_t *rw)
-{
-	int ret;
-
-	asm volatile ("LNKGETD	%0, [%1]\n"
-		      "CMP	%0, #0\n"
-		      "MOV	%0, #1\n"
-		      "XORNZ     %0, %0, %0\n"
-		      : "=&d" (ret)
-		      : "da" (&rw->lock)
-		      : "cc");
-	return ret;
-}
-
 /*
  * Read locks are a bit more hairy:
  *  - Exclusively load the lock value.
@@ -224,21 +209,6 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 	return tmp;
 }
 
-/* read_can_lock - would read_trylock() succeed? */
-static inline int arch_read_can_lock(arch_rwlock_t *rw)
-{
-	int tmp;
-
-	asm volatile ("LNKGETD	%0, [%1]\n"
-		      "CMP	%0, %2\n"
-		      "MOV	%0, #1\n"
-		      "XORZ	%0, %0, %0\n"
-		      : "=&d" (tmp)
-		      : "da" (&rw->lock), "bd" (0x80000000)
-		      : "cc");
-	return tmp;
-}
-
 #define	arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define	arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
diff --git a/arch/metag/include/asm/spinlock_lock1.h b/arch/metag/include/asm/spinlock_lock1.h
index c630444cffe9..8ae12bfc8ad8 100644
--- a/arch/metag/include/asm/spinlock_lock1.h
+++ b/arch/metag/include/asm/spinlock_lock1.h
@@ -104,16 +104,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 	rw->lock = 0;
 }
 
-/* write_can_lock - would write_trylock() succeed? */
-static inline int arch_write_can_lock(arch_rwlock_t *rw)
-{
-	unsigned int ret;
-
-	barrier();
-	ret = rw->lock;
-	return (ret == 0);
-}
-
 /*
  * Read locks are a bit more hairy:
  *  - Exclusively load the lock value.
@@ -171,14 +161,4 @@ static inline int arch_read_trylock(arch_rwlock_t *rw)
 	return (ret < 0x80000000);
 }
 
-/* read_can_lock - would read_trylock() succeed? */
-static inline int arch_read_can_lock(arch_rwlock_t *rw)
-{
-	unsigned int ret;
-
-	barrier();
-	ret = rw->lock;
-	return (ret < 0x80000000);
-}
-
 #endif /* __ASM_SPINLOCK_LOCK1_H */
diff --git a/arch/mn10300/include/asm/spinlock.h b/arch/mn10300/include/asm/spinlock.h
index fe413b41df6c..54f75dac8094 100644
--- a/arch/mn10300/include/asm/spinlock.h
+++ b/arch/mn10300/include/asm/spinlock.h
@@ -98,18 +98,6 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lock,
  * read-locks.
  */
 
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_read_can_lock(x) ((int)(x)->lock > 0)
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_write_can_lock(x) ((x)->lock == RW_LOCK_BIAS)
-
 /*
  * On mn10300, we implement read-write locks as a 32-bit counter
  * with the high bit (sign) being the "contended" bit.
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index 55bfe4affca3..136e1c9bb8a9 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -168,24 +168,6 @@ static __inline__ int arch_write_trylock(arch_rwlock_t *rw)
 	return result;
 }
 
-/*
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static __inline__ int arch_read_can_lock(arch_rwlock_t *rw)
-{
-	return rw->counter >= 0;
-}
-
-/*
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static __inline__ int arch_write_can_lock(arch_rwlock_t *rw)
-{
-	return !rw->counter;
-}
-
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index edbe571bcc54..d83f4f755ad8 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -181,9 +181,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
  * read-locks.
  */
 
-#define arch_read_can_lock(rw)		((rw)->lock >= 0)
-#define arch_write_can_lock(rw)	(!(rw)->lock)
-
 #ifdef CONFIG_PPC64
 #define __DO_SIGN_EXTEND	"extsw	%0,%0\n"
 #define WRLOCK_TOKEN		LOCK_TOKEN	/* it's negative */
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 8182b521c42f..dc9c58ed9e4c 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -110,18 +110,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
  * read-locks.
  */
 
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_read_can_lock(x) ((int)(x)->lock >= 0)
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_write_can_lock(x) ((x)->lock == 0)
-
 extern int _raw_read_trylock_retry(arch_rwlock_t *lp);
 extern int _raw_write_trylock_retry(arch_rwlock_t *lp);
 
diff --git a/arch/sh/include/asm/spinlock-cas.h b/arch/sh/include/asm/spinlock-cas.h
index 5ed7dbbd94ff..315467834521 100644
--- a/arch/sh/include/asm/spinlock-cas.h
+++ b/arch/sh/include/asm/spinlock-cas.h
@@ -53,18 +53,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * read-locks.
  */
 
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_read_can_lock(x)	((x)->lock > 0)
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_write_can_lock(x)	((x)->lock == RW_LOCK_BIAS)
-
 static inline void arch_read_lock(arch_rwlock_t *rw)
 {
 	unsigned old;
diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h
index f77263aae760..06be4a55f3c4 100644
--- a/arch/sh/include/asm/spinlock-llsc.h
+++ b/arch/sh/include/asm/spinlock-llsc.h
@@ -89,18 +89,6 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
  * read-locks.
  */
 
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_read_can_lock(x)	((x)->lock > 0)
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-#define arch_write_can_lock(x)	((x)->lock == RW_LOCK_BIAS)
-
 static inline void arch_read_lock(arch_rwlock_t *rw)
 {
 	unsigned long tmp;
diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index 67345b2dc408..76986b8f7dad 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -190,9 +190,6 @@ static inline int __arch_read_trylock(arch_rwlock_t *rw)
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()
 
-#define arch_read_can_lock(rw) (!((rw)->lock & 0xff))
-#define arch_write_can_lock(rw) (!(rw)->lock)
-
 #endif /* !(__ASSEMBLY__) */
 
 #endif /* __SPARC_SPINLOCK_H */
diff --git a/arch/tile/include/asm/spinlock_32.h b/arch/tile/include/asm/spinlock_32.h
index cba8ba9b8da6..91d05f21cba9 100644
--- a/arch/tile/include/asm/spinlock_32.h
+++ b/arch/tile/include/asm/spinlock_32.h
@@ -79,22 +79,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 #define _RD_COUNT_SHIFT 24
 #define _RD_COUNT_WIDTH 8
 
-/**
- * arch_read_can_lock() - would read_trylock() succeed?
- */
-static inline int arch_read_can_lock(arch_rwlock_t *rwlock)
-{
-	return (rwlock->lock << _RD_COUNT_WIDTH) == 0;
-}
-
-/**
- * arch_write_can_lock() - would write_trylock() succeed?
- */
-static inline int arch_write_can_lock(arch_rwlock_t *rwlock)
-{
-	return rwlock->lock == 0;
-}
-
 /**
  * arch_read_lock() - acquire a read lock.
  */
diff --git a/arch/tile/include/asm/spinlock_64.h b/arch/tile/include/asm/spinlock_64.h
index 9a2c2d605752..c802f48badf4 100644
--- a/arch/tile/include/asm/spinlock_64.h
+++ b/arch/tile/include/asm/spinlock_64.h
@@ -93,24 +93,6 @@ static inline int arch_write_val_locked(int val)
 	return val < 0;  /* Optimize "val & __WRITE_LOCK_BIT". */
 }
 
-/**
- * read_can_lock - would read_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static inline int arch_read_can_lock(arch_rwlock_t *rw)
-{
-	return !arch_write_val_locked(rw->lock);
-}
-
-/**
- * write_can_lock - would write_trylock() succeed?
- * @lock: the rwlock in question.
- */
-static inline int arch_write_can_lock(arch_rwlock_t *rw)
-{
-	return rw->lock == 0;
-}
-
 extern void __read_lock_failed(arch_rwlock_t *rw);
 
 static inline void arch_read_lock(arch_rwlock_t *rw)
diff --git a/arch/xtensa/include/asm/spinlock.h b/arch/xtensa/include/asm/spinlock.h
index 3bb49681ee24..d005af51e2e1 100644
--- a/arch/xtensa/include/asm/spinlock.h
+++ b/arch/xtensa/include/asm/spinlock.h
@@ -97,8 +97,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
  *  0x80000000  one writer owns the rwlock, no other writers, no readers
  */
 
-#define arch_write_can_lock(x)  ((x)->lock == 0)
-
 static inline void arch_write_lock(arch_rwlock_t *rw)
 {
 	unsigned long tmp;
diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 7d026bf27713..50925327b0a8 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -52,24 +52,6 @@
 extern void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts);
 extern void queued_write_lock_slowpath(struct qrwlock *lock);
 
-/**
- * queued_read_can_lock- would read_trylock() succeed?
- * @lock: Pointer to queue rwlock structure
- */
-static inline int queued_read_can_lock(struct qrwlock *lock)
-{
-	return !(atomic_read(&lock->cnts) & _QW_WMASK);
-}
-
-/**
- * queued_write_can_lock- would write_trylock() succeed?
- * @lock: Pointer to queue rwlock structure
- */
-static inline int queued_write_can_lock(struct qrwlock *lock)
-{
-	return !atomic_read(&lock->cnts);
-}
-
 /**
  * queued_read_trylock - try to acquire read lock of a queue rwlock
  * @lock : Pointer to queue rwlock structure
@@ -169,8 +151,6 @@ static inline void queued_write_unlock(struct qrwlock *lock)
  * Remapping rwlock architecture specific functions to the corresponding
  * queue rwlock functions.
  */
-#define arch_read_can_lock(l)	queued_read_can_lock(l)
-#define arch_write_can_lock(l)	queued_write_can_lock(l)
 #define arch_read_lock(l)	queued_read_lock(l)
 #define arch_write_lock(l)	queued_write_lock(l)
 #define arch_read_trylock(l)	queued_read_trylock(l)
diff --git a/include/linux/rwlock.h b/include/linux/rwlock.h
index bc2994ed66e1..766c5ca5cbd1 100644
--- a/include/linux/rwlock.h
+++ b/include/linux/rwlock.h
@@ -50,9 +50,6 @@ do {								\
 # define do_raw_write_unlock(rwlock)	do {arch_write_unlock(&(rwlock)->raw_lock); __release(lock); } while (0)
 #endif
 
-#define read_can_lock(rwlock)		arch_read_can_lock(&(rwlock)->raw_lock)
-#define write_can_lock(rwlock)		arch_write_can_lock(&(rwlock)->raw_lock)
-
 /*
  * Define the various rw_lock methods.  Note we define these
  * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 69e079c5ff98..1e3e48041800 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -278,12 +278,6 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
 	1 : ({ local_irq_restore(flags); 0; }); \
 })
 
-/**
- * raw_spin_can_lock - would raw_spin_trylock() succeed?
- * @lock: the spinlock in question.
- */
-#define raw_spin_can_lock(lock)	(!raw_spin_is_locked(lock))
-
 /* Include rwlock functions */
 #include <linux/rwlock.h>
 
@@ -396,11 +390,6 @@ static __always_inline int spin_is_contended(spinlock_t *lock)
 	return raw_spin_is_contended(&lock->rlock);
 }
 
-static __always_inline int spin_can_lock(spinlock_t *lock)
-{
-	return raw_spin_can_lock(&lock->rlock);
-}
-
 #define assert_spin_locked(lock)	assert_raw_spin_locked(&(lock)->rlock)
 
 /*
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index 612fb530af41..901cf8f44388 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -77,7 +77,4 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 
 #define arch_spin_is_contended(lock)	(((void)(lock), 0))
 
-#define arch_read_can_lock(lock)	(((void)(lock), 1))
-#define arch_write_can_lock(lock)	(((void)(lock), 1))
-
 #endif /* __LINUX_SPINLOCK_UP_H */
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 4b082b5cac9e..8fd48b5552a7 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -32,8 +32,6 @@
  * include/linux/spinlock_api_smp.h
  */
 #else
-#define raw_read_can_lock(l)	read_can_lock(l)
-#define raw_write_can_lock(l)	write_can_lock(l)
 
 /*
  * Some architectures can relax in favour of the CPU owning the lock.
@@ -68,7 +66,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock)			\
 									\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
-		while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+		while ((lock)->break_lock)				\
 			arch_##op##_relax(&lock->raw_lock);		\
 	}								\
 	(lock)->break_lock = 0;						\
@@ -88,7 +86,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock)	\
 									\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
-		while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+		while ((lock)->break_lock)				\
 			arch_##op##_relax(&lock->raw_lock);		\
 	}								\
 	(lock)->break_lock = 0;						\
-- 
cgit v1.3-14-g43fede


From de8cd83e91bc3ee212b3e6ec6e4283af9e4ab269 Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Mon, 2 Oct 2017 20:21:39 -0400
Subject: audit: Record fanotify access control decisions

The fanotify interface allows user space daemons to make access
control decisions. Under common criteria requirements, we need to
optionally record decisions based on policy. This patch adds a bit mask,
FAN_AUDIT, that a user space daemon can 'or' into the response decision
which will tell the kernel that it made a decision and record it.

It would be used something like this in user space code:

  response.response = FAN_DENY | FAN_AUDIT;
  write(fd, &response, sizeof(struct fanotify_response));

When the syscall ends, the audit system will record the decision as a
AUDIT_FANOTIFY auxiliary record to denote that the reason this event
occurred is the result of an access control decision from fanotify
rather than DAC or MAC policy.

A sample event looks like this:

type=PATH msg=audit(1504310584.332:290): item=0 name="./evil-ls"
inode=1319561 dev=fc:03 mode=0100755 ouid=1000 ogid=1000 rdev=00:00
obj=unconfined_u:object_r:user_home_t:s0 nametype=NORMAL
type=CWD msg=audit(1504310584.332:290): cwd="/home/sgrubb"
type=SYSCALL msg=audit(1504310584.332:290): arch=c000003e syscall=2
success=no exit=-1 a0=32cb3fca90 a1=0 a2=43 a3=8 items=1 ppid=901
pid=959 auid=1000 uid=1000 gid=1000 euid=1000 suid=1000
fsuid=1000 egid=1000 sgid=1000 fsgid=1000 tty=pts1 ses=3 comm="bash"
exe="/usr/bin/bash" subj=unconfined_u:unconfined_r:unconfined_t:
s0-s0:c0.c1023 key=(null)
type=FANOTIFY msg=audit(1504310584.332:290): resp=2

Prior to using the audit flag, the developer needs to call
fanotify_init or'ing in FAN_ENABLE_AUDIT to ensure that the kernel
supports auditing. The calling process must also have the CAP_AUDIT_WRITE
capability.

Signed-off-by: sgrubb <sgrubb@redhat.com>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/fanotify/fanotify.c      |  8 +++++++-
 fs/notify/fanotify/fanotify_user.c | 16 +++++++++++++++-
 fs/notify/fdinfo.c                 |  3 +++
 include/linux/audit.h              | 10 ++++++++++
 include/linux/fsnotify_backend.h   |  1 +
 include/uapi/linux/audit.h         |  1 +
 include/uapi/linux/fanotify.h      |  3 +++
 kernel/auditsc.c                   |  6 ++++++
 8 files changed, 46 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 2fa99aeaa095..1968d21a3f37 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,6 +9,7 @@
 #include <linux/sched/user.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/audit.h>
 
 #include "fanotify.h"
 
@@ -78,7 +79,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
 	fsnotify_finish_user_wait(iter_info);
 out:
 	/* userspace responded, convert to something usable */
-	switch (event->response) {
+	switch (event->response & ~FAN_AUDIT) {
 	case FAN_ALLOW:
 		ret = 0;
 		break;
@@ -86,6 +87,11 @@ out:
 	default:
 		ret = -EPERM;
 	}
+
+	/* Check if the response should be audited */
+	if (event->response & FAN_AUDIT)
+		audit_fanotify(event->response & ~FAN_AUDIT);
+
 	event->response = 0;
 
 	pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 907a481ac781..0455ea729384 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -179,7 +179,7 @@ static int process_access_response(struct fsnotify_group *group,
 	 * userspace can send a valid response or we will clean it up after the
 	 * timeout
 	 */
-	switch (response) {
+	switch (response & ~FAN_AUDIT) {
 	case FAN_ALLOW:
 	case FAN_DENY:
 		break;
@@ -190,6 +190,9 @@ static int process_access_response(struct fsnotify_group *group,
 	if (fd < 0)
 		return -EINVAL;
 
+	if ((response & FAN_AUDIT) && !group->fanotify_data.audit)
+		return -EINVAL;
+
 	event = dequeue_event(group, fd);
 	if (!event)
 		return -ENOENT;
@@ -721,7 +724,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+#ifdef CONFIG_AUDITSYSCALL
+	if (flags & ~(FAN_ALL_INIT_FLAGS | FAN_ENABLE_AUDIT))
+#else
 	if (flags & ~FAN_ALL_INIT_FLAGS)
+#endif
 		return -EINVAL;
 
 	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
@@ -805,6 +812,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
 	}
 
+	if (flags & FAN_ENABLE_AUDIT) {
+		fd = -EPERM;
+		if (!capable(CAP_AUDIT_WRITE))
+			goto out_destroy_group;
+		group->fanotify_data.audit = true;
+	}
+
 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
 	if (fd < 0)
 		goto out_destroy_group;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index dd63aa9a6f9a..645ab561e790 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -156,6 +156,9 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
 	if (group->fanotify_data.max_marks == UINT_MAX)
 		flags |= FAN_UNLIMITED_MARKS;
 
+	if (group->fanotify_data.audit)
+		flags |= FAN_ENABLE_AUDIT;
+
 	seq_printf(m, "fanotify flags:%x event-flags:%x\n",
 		   flags, group->fanotify_data.f_flags);
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
index cb708eb8accc..d66220dac364 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -356,6 +356,7 @@ extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
 extern void __audit_log_capset(const struct cred *new, const struct cred *old);
 extern void __audit_mmap_fd(int fd, int flags);
 extern void __audit_log_kern_module(char *name);
+extern void __audit_fanotify(unsigned int response);
 
 static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
 {
@@ -452,6 +453,12 @@ static inline void audit_log_kern_module(char *name)
 		__audit_log_kern_module(name);
 }
 
+static inline void audit_fanotify(unsigned int response)
+{
+	if (!audit_dummy_context())
+		__audit_fanotify(response);
+}
+
 extern int audit_n_rules;
 extern int audit_signals;
 #else /* CONFIG_AUDITSYSCALL */
@@ -568,6 +575,9 @@ static inline void audit_log_kern_module(char *name)
 {
 }
 
+static inline void audit_fanotify(unsigned int response)
+{ }
+
 static inline void audit_ptrace(struct task_struct *t)
 { }
 #define audit_n_rules 0
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index c6c69318752b..4a474f972910 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -190,6 +190,7 @@ struct fsnotify_group {
 			int f_flags;
 			unsigned int max_marks;
 			struct user_struct *user;
+			bool audit;
 		} fanotify_data;
 #endif /* CONFIG_FANOTIFY */
 	};
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 0714a66f0e0c..221f8b7f01b2 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -112,6 +112,7 @@
 #define AUDIT_FEATURE_CHANGE	1328	/* audit log listing feature changes */
 #define AUDIT_REPLACE		1329	/* Replace auditd if this packet unanswerd */
 #define AUDIT_KERN_MODULE	1330	/* Kernel Module events */
+#define AUDIT_FANOTIFY		1331	/* Fanotify access decision */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 030508d195d3..5dda19a9a947 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -35,6 +35,7 @@
 
 #define FAN_UNLIMITED_QUEUE	0x00000010
 #define FAN_UNLIMITED_MARKS	0x00000020
+#define FAN_ENABLE_AUDIT	0x00000040
 
 #define FAN_ALL_INIT_FLAGS	(FAN_CLOEXEC | FAN_NONBLOCK | \
 				 FAN_ALL_CLASS_BITS | FAN_UNLIMITED_QUEUE |\
@@ -99,6 +100,8 @@ struct fanotify_response {
 /* Legit userspace responses to a _PERM event */
 #define FAN_ALLOW	0x01
 #define FAN_DENY	0x02
+#define FAN_AUDIT	0x10	/* Bit mask to create audit record for result */
+
 /* No fd set in event */
 #define FAN_NOFD	-1
 
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ecc23e25c9eb..9c723e978245 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2390,6 +2390,12 @@ void __audit_log_kern_module(char *name)
 	context->type = AUDIT_KERN_MODULE;
 }
 
+void __audit_fanotify(unsigned int response)
+{
+	audit_log(current->audit_context, GFP_KERNEL,
+		AUDIT_FANOTIFY,	"resp=%u", response);
+}
+
 static void audit_log_task(struct audit_buffer *ab)
 {
 	kuid_t auid, uid;
-- 
cgit v1.3-14-g43fede


From e7bf8249e8f1bac64885eeccb55bcf6111901a81 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:10 -0700
Subject: bpf: encapsulate verifier log state into a structure

Put the loose log_* variables into a structure.  This will make
it simpler to remove the global verifier state in following patches.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h | 13 ++++++++++
 kernel/bpf/verifier.c        | 57 +++++++++++++++++++++++---------------------
 2 files changed, 43 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b8d200f60a40..163541ba70d9 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -115,6 +115,19 @@ struct bpf_insn_aux_data {
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
+struct bpf_verifer_log {
+	u32 level;
+	char *kbuf;
+	char __user *ubuf;
+	u32 len_used;
+	u32 len_total;
+};
+
+static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
+{
+	return log->len_used >= log->len_total - 1;
+}
+
 struct bpf_verifier_env;
 struct bpf_ext_analyzer_ops {
 	int (*insn_hook)(struct bpf_verifier_env *env,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6352a88ca6d1..e53458b02249 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -156,8 +156,7 @@ struct bpf_call_arg_meta {
 /* verbose verifier prints what it's seeing
  * bpf_check() is called under lock, so no race to access these global vars
  */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
+static struct bpf_verifer_log verifier_log;
 
 static DEFINE_MUTEX(bpf_verifier_lock);
 
@@ -167,13 +166,15 @@ static DEFINE_MUTEX(bpf_verifier_lock);
  */
 static __printf(1, 2) void verbose(const char *fmt, ...)
 {
+	struct bpf_verifer_log *log = &verifier_log;
 	va_list args;
 
-	if (log_level == 0 || log_len >= log_size - 1)
+	if (!log->level || bpf_verifier_log_full(log))
 		return;
 
 	va_start(args, fmt);
-	log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+	log->len_used += vscnprintf(log->kbuf + log->len_used,
+				    log->len_total - log->len_used, fmt, args);
 	va_end(args);
 }
 
@@ -886,7 +887,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * need to try adding each of min_value and max_value to off
 	 * to make sure our theoretical access will be safe.
 	 */
-	if (log_level)
+	if (verifier_log.level)
 		print_verifier_state(state);
 	/* The minimum value is only important with signed
 	 * comparisons where we can't assume the floor of a
@@ -2956,7 +2957,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
 		return -EACCES;
 	}
-	if (log_level)
+	if (verifier_log.level)
 		print_verifier_state(this_branch);
 	return 0;
 }
@@ -3712,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env)
 			return err;
 		if (err == 1) {
 			/* found equivalent state, can prune the search */
-			if (log_level) {
+			if (verifier_log.level) {
 				if (do_print_state)
 					verbose("\nfrom %d to %d: safe\n",
 						prev_insn_idx, insn_idx);
@@ -3725,8 +3726,9 @@ static int do_check(struct bpf_verifier_env *env)
 		if (need_resched())
 			cond_resched();
 
-		if (log_level > 1 || (log_level && do_print_state)) {
-			if (log_level > 1)
+		if (verifier_log.level > 1 ||
+		    (verifier_log.level && do_print_state)) {
+			if (verifier_log.level > 1)
 				verbose("%d:", insn_idx);
 			else
 				verbose("\nfrom %d to %d:",
@@ -3735,7 +3737,7 @@ static int do_check(struct bpf_verifier_env *env)
 			do_print_state = false;
 		}
 
-		if (log_level) {
+		if (verifier_log.level) {
 			verbose("%d: ", insn_idx);
 			print_bpf_insn(env, insn);
 		}
@@ -4389,7 +4391,7 @@ static void free_states(struct bpf_verifier_env *env)
 
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 {
-	char __user *log_ubuf = NULL;
+	struct bpf_verifer_log *log = &verifier_log;
 	struct bpf_verifier_env *env;
 	int ret = -EINVAL;
 
@@ -4414,23 +4416,23 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		/* user requested verbose verifier output
 		 * and supplied buffer to store the verification trace
 		 */
-		log_level = attr->log_level;
-		log_ubuf = (char __user *) (unsigned long) attr->log_buf;
-		log_size = attr->log_size;
-		log_len = 0;
+		log->level = attr->log_level;
+		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+		log->len_total = attr->log_size;
+		log->len_used = 0;
 
 		ret = -EINVAL;
-		/* log_* values have to be sane */
-		if (log_size < 128 || log_size > UINT_MAX >> 8 ||
-		    log_level == 0 || log_ubuf == NULL)
+		/* log attributes have to be sane */
+		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+		    !log->level || !log->ubuf)
 			goto err_unlock;
 
 		ret = -ENOMEM;
-		log_buf = vmalloc(log_size);
-		if (!log_buf)
+		log->kbuf = vmalloc(log->len_total);
+		if (!log->kbuf)
 			goto err_unlock;
 	} else {
-		log_level = 0;
+		log->level = 0;
 	}
 
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4467,15 +4469,16 @@ skip_full_check:
 	if (ret == 0)
 		ret = fixup_bpf_calls(env);
 
-	if (log_level && log_len >= log_size - 1) {
-		BUG_ON(log_len >= log_size);
+	if (log->level && bpf_verifier_log_full(log)) {
+		BUG_ON(log->len_used >= log->len_total);
 		/* verifier log exceeded user supplied buffer */
 		ret = -ENOSPC;
 		/* fall through to return what was recorded */
 	}
 
 	/* copy verifier log back to user space including trailing zero */
-	if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+	if (log->level && copy_to_user(log->ubuf, log->kbuf,
+				       log->len_used + 1) != 0) {
 		ret = -EFAULT;
 		goto free_log_buf;
 	}
@@ -4502,8 +4505,8 @@ skip_full_check:
 	}
 
 free_log_buf:
-	if (log_level)
-		vfree(log_buf);
+	if (log->level)
+		vfree(log->kbuf);
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
 		 * them now. Otherwise free_bpf_prog_info() will release them.
@@ -4540,7 +4543,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
 
-	log_level = 0;
+	verifier_log.level = 0;
 
 	env->strict_alignment = false;
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-- 
cgit v1.3-14-g43fede


From 61bd5218eef349fcacc4976a251bc83a4748b4af Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:11 -0700
Subject: bpf: move global verifier log into verifier environment

The biggest piece of global state protected by the verifier lock
is the verifier_log.  Move that log to struct bpf_verifier_env.
struct bpf_verifier_env has to be passed now to all invocations
of verbose().

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |   2 +
 kernel/bpf/verifier.c        | 491 +++++++++++++++++++++++--------------------
 2 files changed, 261 insertions(+), 232 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 163541ba70d9..5ddb9a626a51 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -152,6 +152,8 @@ struct bpf_verifier_env {
 	bool allow_ptr_leaks;
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
+
+	struct bpf_verifer_log log;
 };
 
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e53458b02249..a352f93cd4b2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,20 +153,16 @@ struct bpf_call_arg_meta {
 	int access_size;
 };
 
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static struct bpf_verifer_log verifier_log;
-
 static DEFINE_MUTEX(bpf_verifier_lock);
 
 /* log_level controls verbosity level of eBPF verifier.
  * verbose() is used to dump the verification trace to the log, so the user
  * can figure out what's wrong with the program
  */
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+				   const char *fmt, ...)
 {
-	struct bpf_verifer_log *log = &verifier_log;
+	struct bpf_verifer_log *log = &env->log;
 	va_list args;
 
 	if (!log->level || bpf_verifier_log_full(log))
@@ -214,7 +210,8 @@ static const char *func_id_name(int id)
 		return "unknown";
 }
 
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+				 struct bpf_verifier_state *state)
 {
 	struct bpf_reg_state *reg;
 	enum bpf_reg_type t;
@@ -225,21 +222,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 		t = reg->type;
 		if (t == NOT_INIT)
 			continue;
-		verbose(" R%d=%s", i, reg_type_str[t]);
+		verbose(env, " R%d=%s", i, reg_type_str[t]);
 		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
 		    tnum_is_const(reg->var_off)) {
 			/* reg->off should be 0 for SCALAR_VALUE */
-			verbose("%lld", reg->var_off.value + reg->off);
+			verbose(env, "%lld", reg->var_off.value + reg->off);
 		} else {
-			verbose("(id=%d", reg->id);
+			verbose(env, "(id=%d", reg->id);
 			if (t != SCALAR_VALUE)
-				verbose(",off=%d", reg->off);
+				verbose(env, ",off=%d", reg->off);
 			if (type_is_pkt_pointer(t))
-				verbose(",r=%d", reg->range);
+				verbose(env, ",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
 				 t == PTR_TO_MAP_VALUE ||
 				 t == PTR_TO_MAP_VALUE_OR_NULL)
-				verbose(",ks=%d,vs=%d",
+				verbose(env, ",ks=%d,vs=%d",
 					reg->map_ptr->key_size,
 					reg->map_ptr->value_size);
 			if (tnum_is_const(reg->var_off)) {
@@ -247,38 +244,38 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 				 * could be a pointer whose offset is too big
 				 * for reg->off
 				 */
-				verbose(",imm=%llx", reg->var_off.value);
+				verbose(env, ",imm=%llx", reg->var_off.value);
 			} else {
 				if (reg->smin_value != reg->umin_value &&
 				    reg->smin_value != S64_MIN)
-					verbose(",smin_value=%lld",
+					verbose(env, ",smin_value=%lld",
 						(long long)reg->smin_value);
 				if (reg->smax_value != reg->umax_value &&
 				    reg->smax_value != S64_MAX)
-					verbose(",smax_value=%lld",
+					verbose(env, ",smax_value=%lld",
 						(long long)reg->smax_value);
 				if (reg->umin_value != 0)
-					verbose(",umin_value=%llu",
+					verbose(env, ",umin_value=%llu",
 						(unsigned long long)reg->umin_value);
 				if (reg->umax_value != U64_MAX)
-					verbose(",umax_value=%llu",
+					verbose(env, ",umax_value=%llu",
 						(unsigned long long)reg->umax_value);
 				if (!tnum_is_unknown(reg->var_off)) {
 					char tn_buf[48];
 
 					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-					verbose(",var_off=%s", tn_buf);
+					verbose(env, ",var_off=%s", tn_buf);
 				}
 			}
-			verbose(")");
+			verbose(env, ")");
 		}
 	}
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] == STACK_SPILL)
-			verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+			verbose(env, " fp%d=%s", -MAX_BPF_STACK + i,
 				reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
 	}
-	verbose("\n");
+	verbose(env, "\n");
 }
 
 static const char *const bpf_class_string[] = {
@@ -333,15 +330,15 @@ static const char *const bpf_jmp_string[16] = {
 	[BPF_EXIT >> 4] = "exit",
 };
 
-static void print_bpf_end_insn(const struct bpf_verifier_env *env,
+static void print_bpf_end_insn(struct bpf_verifier_env *env,
 			       const struct bpf_insn *insn)
 {
-	verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
 		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
 		insn->imm, insn->dst_reg);
 }
 
-static void print_bpf_insn(const struct bpf_verifier_env *env,
+static void print_bpf_insn(struct bpf_verifier_env *env,
 			   const struct bpf_insn *insn)
 {
 	u8 class = BPF_CLASS(insn->code);
@@ -349,23 +346,23 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
 	if (class == BPF_ALU || class == BPF_ALU64) {
 		if (BPF_OP(insn->code) == BPF_END) {
 			if (class == BPF_ALU64)
-				verbose("BUG_alu64_%02x\n", insn->code);
+				verbose(env, "BUG_alu64_%02x\n", insn->code);
 			else
 				print_bpf_end_insn(env, insn);
 		} else if (BPF_OP(insn->code) == BPF_NEG) {
-			verbose("(%02x) r%d = %s-r%d\n",
+			verbose(env, "(%02x) r%d = %s-r%d\n",
 				insn->code, insn->dst_reg,
 				class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose("(%02x) %sr%d %s %sr%d\n",
+			verbose(env, "(%02x) %sr%d %s %sr%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
 				class == BPF_ALU ? "(u32) " : "",
 				insn->src_reg);
 		} else {
-			verbose("(%02x) %sr%d %s %s%d\n",
+			verbose(env, "(%02x) %sr%d %s %s%d\n",
 				insn->code, class == BPF_ALU ? "(u32) " : "",
 				insn->dst_reg,
 				bpf_alu_string[BPF_OP(insn->code) >> 4],
@@ -374,46 +371,46 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
 		}
 	} else if (class == BPF_STX) {
 		if (BPF_MODE(insn->code) == BPF_MEM)
-			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg,
 				insn->off, insn->src_reg);
 		else if (BPF_MODE(insn->code) == BPF_XADD)
-			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->dst_reg, insn->off,
 				insn->src_reg);
 		else
-			verbose("BUG_%02x\n", insn->code);
+			verbose(env, "BUG_%02x\n", insn->code);
 	} else if (class == BPF_ST) {
 		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose("BUG_st_%02x\n", insn->code);
+			verbose(env, "BUG_st_%02x\n", insn->code);
 			return;
 		}
-		verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
 			insn->code,
 			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 			insn->dst_reg,
 			insn->off, insn->imm);
 	} else if (class == BPF_LDX) {
 		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose("BUG_ldx_%02x\n", insn->code);
+			verbose(env, "BUG_ldx_%02x\n", insn->code);
 			return;
 		}
-		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
 			insn->code, insn->dst_reg,
 			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 			insn->src_reg, insn->off);
 	} else if (class == BPF_LD) {
 		if (BPF_MODE(insn->code) == BPF_ABS) {
-			verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->imm);
 		} else if (BPF_MODE(insn->code) == BPF_IND) {
-			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
 				insn->code,
 				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
 				insn->src_reg, insn->imm);
@@ -428,36 +425,37 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
 			if (map_ptr && !env->allow_ptr_leaks)
 				imm = 0;
 
-			verbose("(%02x) r%d = 0x%llx\n", insn->code,
+			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
 				insn->dst_reg, (unsigned long long)imm);
 		} else {
-			verbose("BUG_ld_%02x\n", insn->code);
+			verbose(env, "BUG_ld_%02x\n", insn->code);
 			return;
 		}
 	} else if (class == BPF_JMP) {
 		u8 opcode = BPF_OP(insn->code);
 
 		if (opcode == BPF_CALL) {
-			verbose("(%02x) call %s#%d\n", insn->code,
+			verbose(env, "(%02x) call %s#%d\n", insn->code,
 				func_id_name(insn->imm), insn->imm);
 		} else if (insn->code == (BPF_JMP | BPF_JA)) {
-			verbose("(%02x) goto pc%+d\n",
+			verbose(env, "(%02x) goto pc%+d\n",
 				insn->code, insn->off);
 		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-			verbose("(%02x) exit\n", insn->code);
+			verbose(env, "(%02x) exit\n", insn->code);
 		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
 				insn->code, insn->dst_reg,
 				bpf_jmp_string[BPF_OP(insn->code) >> 4],
 				insn->src_reg, insn->off);
 		} else {
-			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
 				insn->code, insn->dst_reg,
 				bpf_jmp_string[BPF_OP(insn->code) >> 4],
 				insn->imm, insn->off);
 		}
 	} else {
-		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+		verbose(env, "(%02x) %s\n",
+			insn->code, bpf_class_string[class]);
 	}
 }
 
@@ -496,7 +494,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	env->head = elem;
 	env->stack_size++;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
-		verbose("BPF program is too complex\n");
+		verbose(env, "BPF program is too complex\n");
 		goto err;
 	}
 	return &elem->st;
@@ -534,10 +532,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
 	__mark_reg_known(reg, 0);
 }
 
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+				struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_known_zero(regs, %u)\n", regno);
+		verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -647,10 +646,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
 	__mark_reg_unbounded(reg);
 }
 
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+			     struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_unknown(regs, %u)\n", regno);
+		verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -665,10 +665,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
 	reg->type = NOT_INIT;
 }
 
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+			      struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_not_init(regs, %u)\n", regno);
+		verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -677,22 +678,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_not_init(regs + regno);
 }
 
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+			   struct bpf_reg_state *regs)
 {
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++) {
-		mark_reg_not_init(regs, i);
+		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
 	}
 
 	/* frame pointer */
 	regs[BPF_REG_FP].type = PTR_TO_STACK;
-	mark_reg_known_zero(regs, BPF_REG_FP);
+	mark_reg_known_zero(env, regs, BPF_REG_FP);
 
 	/* 1st arg to a function */
 	regs[BPF_REG_1].type = PTR_TO_CTX;
-	mark_reg_known_zero(regs, BPF_REG_1);
+	mark_reg_known_zero(env, regs, BPF_REG_1);
 }
 
 enum reg_arg_type {
@@ -726,26 +728,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 	struct bpf_reg_state *regs = env->cur_state.regs;
 
 	if (regno >= MAX_BPF_REG) {
-		verbose("R%d is invalid\n", regno);
+		verbose(env, "R%d is invalid\n", regno);
 		return -EINVAL;
 	}
 
 	if (t == SRC_OP) {
 		/* check whether register used as source operand can be read */
 		if (regs[regno].type == NOT_INIT) {
-			verbose("R%d !read_ok\n", regno);
+			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
 		mark_reg_read(&env->cur_state, regno);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
-			verbose("frame pointer is read only\n");
+			verbose(env, "frame pointer is read only\n");
 			return -EACCES;
 		}
 		regs[regno].live |= REG_LIVE_WRITTEN;
 		if (t == DST_OP)
-			mark_reg_unknown(regs, regno);
+			mark_reg_unknown(env, regs, regno);
 	}
 	return 0;
 }
@@ -770,7 +772,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+			     struct bpf_verifier_state *state, int off,
 			     int size, int value_regno)
 {
 	int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
@@ -783,7 +786,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off,
 
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
-			verbose("invalid size of register spill\n");
+			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
 
@@ -818,7 +821,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
 	}
 }
 
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+			    struct bpf_verifier_state *state, int off, int size,
 			    int value_regno)
 {
 	u8 *slot_type;
@@ -828,12 +832,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
 
 	if (slot_type[0] == STACK_SPILL) {
 		if (size != BPF_REG_SIZE) {
-			verbose("invalid size of register spill\n");
+			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
 		for (i = 1; i < BPF_REG_SIZE; i++) {
 			if (slot_type[i] != STACK_SPILL) {
-				verbose("corrupted spill memory\n");
+				verbose(env, "corrupted spill memory\n");
 				return -EACCES;
 			}
 		}
@@ -849,14 +853,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
 	} else {
 		for (i = 0; i < size; i++) {
 			if (slot_type[i] != STACK_MISC) {
-				verbose("invalid read from stack off %d+%d size %d\n",
+				verbose(env, "invalid read from stack off %d+%d size %d\n",
 					off, i, size);
 				return -EACCES;
 			}
 		}
 		if (value_regno >= 0)
 			/* have read misc data from the stack */
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, state->regs, value_regno);
 		return 0;
 	}
 }
@@ -868,7 +872,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 	struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
 
 	if (off < 0 || size <= 0 || off + size > map->value_size) {
-		verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
 			map->value_size, off, size);
 		return -EACCES;
 	}
@@ -887,8 +891,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * need to try adding each of min_value and max_value to off
 	 * to make sure our theoretical access will be safe.
 	 */
-	if (verifier_log.level)
-		print_verifier_state(state);
+	if (env->log.level)
+		print_verifier_state(env, state);
 	/* The minimum value is only important with signed
 	 * comparisons where we can't assume the floor of a
 	 * value is 0.  If we are using signed variables for our
@@ -896,13 +900,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * will have a set floor within our range.
 	 */
 	if (reg->smin_value < 0) {
-		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 			regno);
 		return -EACCES;
 	}
 	err = __check_map_access(env, regno, reg->smin_value + off, size);
 	if (err) {
-		verbose("R%d min value is outside of the array range\n", regno);
+		verbose(env, "R%d min value is outside of the array range\n",
+			regno);
 		return err;
 	}
 
@@ -911,13 +916,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * If reg->umax_value + off could overflow, treat that as unbounded too.
 	 */
 	if (reg->umax_value >= BPF_MAX_VAR_OFF) {
-		verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+		verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
 			regno);
 		return -EACCES;
 	}
 	err = __check_map_access(env, regno, reg->umax_value + off, size);
 	if (err)
-		verbose("R%d max value is outside of the array range\n", regno);
+		verbose(env, "R%d max value is outside of the array range\n",
+			regno);
 	return err;
 }
 
@@ -956,7 +962,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 	struct bpf_reg_state *reg = &regs[regno];
 
 	if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
-		verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
 			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
 	}
@@ -979,13 +985,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 	 * detail to prove they're safe.
 	 */
 	if (reg->smin_value < 0) {
-		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 			regno);
 		return -EACCES;
 	}
 	err = __check_packet_access(env, regno, off, size);
 	if (err) {
-		verbose("R%d offset is outside of the packet\n", regno);
+		verbose(env, "R%d offset is outside of the packet\n", regno);
 		return err;
 	}
 	return err;
@@ -1021,7 +1027,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		return 0;
 	}
 
-	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+	verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
 }
 
@@ -1039,7 +1045,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
 }
 
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+				   const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
 {
 	struct tnum reg_off;
@@ -1064,7 +1071,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+		verbose(env,
+			"misaligned packet access off %d+%s+%d+%d size %d\n",
 			ip_align, tn_buf, reg->off, off, size);
 		return -EACCES;
 	}
@@ -1072,7 +1080,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 	return 0;
 }
 
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+				       const struct bpf_reg_state *reg,
 				       const char *pointer_desc,
 				       int off, int size, bool strict)
 {
@@ -1087,7 +1096,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose("misaligned %saccess off %s+%d+%d size %d\n",
+		verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
 			pointer_desc, tn_buf, reg->off, off, size);
 		return -EACCES;
 	}
@@ -1108,7 +1117,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 		/* Special case, because of NET_IP_ALIGN. Given metadata sits
 		 * right in front, treat it the very same way.
 		 */
-		return check_pkt_ptr_alignment(reg, off, size, strict);
+		return check_pkt_ptr_alignment(env, reg, off, size, strict);
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
 		break;
@@ -1121,7 +1130,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	default:
 		break;
 	}
-	return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+	return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+					   strict);
 }
 
 /* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1153,20 +1163,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into map\n", value_regno);
+			verbose(env, "R%d leaks addr into map\n", value_regno);
 			return -EACCES;
 		}
 
 		err = check_map_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, state->regs, value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = SCALAR_VALUE;
 
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into ctx\n", value_regno);
+			verbose(env, "R%d leaks addr into ctx\n", value_regno);
 			return -EACCES;
 		}
 		/* ctx accesses must be at a fixed offset, so that we can
@@ -1176,7 +1186,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose("variable ctx access var_off=%s off=%d size=%d",
+			verbose(env,
+				"variable ctx access var_off=%s off=%d size=%d",
 				tn_buf, off, size);
 			return -EACCES;
 		}
@@ -1188,9 +1199,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
-				mark_reg_unknown(state->regs, value_regno);
+				mark_reg_unknown(env, state->regs, value_regno);
 			else
-				mark_reg_known_zero(state->regs, value_regno);
+				mark_reg_known_zero(env, state->regs,
+						    value_regno);
 			state->regs[value_regno].id = 0;
 			state->regs[value_regno].off = 0;
 			state->regs[value_regno].range = 0;
@@ -1206,13 +1218,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose("variable stack access var_off=%s off=%d size=%d",
+			verbose(env, "variable stack access var_off=%s off=%d size=%d",
 				tn_buf, off, size);
 			return -EACCES;
 		}
 		off += reg->var_off.value;
 		if (off >= 0 || off < -MAX_BPF_STACK) {
-			verbose("invalid stack off=%d size=%d\n", off, size);
+			verbose(env, "invalid stack off=%d size=%d\n", off,
+				size);
 			return -EACCES;
 		}
 
@@ -1223,29 +1236,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			if (!env->allow_ptr_leaks &&
 			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
 			    size != BPF_REG_SIZE) {
-				verbose("attempt to corrupt spilled pointer on stack\n");
+				verbose(env, "attempt to corrupt spilled pointer on stack\n");
 				return -EACCES;
 			}
-			err = check_stack_write(state, off, size, value_regno);
+			err = check_stack_write(env, state, off, size,
+						value_regno);
 		} else {
-			err = check_stack_read(state, off, size, value_regno);
+			err = check_stack_read(env, state, off, size,
+					       value_regno);
 		}
 	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
-			verbose("cannot write into packet\n");
+			verbose(env, "cannot write into packet\n");
 			return -EACCES;
 		}
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into packet\n", value_regno);
+			verbose(env, "R%d leaks addr into packet\n",
+				value_regno);
 			return -EACCES;
 		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, state->regs, value_regno);
 	} else {
-		verbose("R%d invalid mem access '%s'\n",
-			regno, reg_type_str[reg->type]);
+		verbose(env, "R%d invalid mem access '%s'\n", regno,
+			reg_type_str[reg->type]);
 		return -EACCES;
 	}
 
@@ -1265,7 +1281,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 
 	if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
 	    insn->imm != 0) {
-		verbose("BPF_XADD uses reserved fields\n");
+		verbose(env, "BPF_XADD uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -1280,7 +1296,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 		return err;
 
 	if (is_pointer_value(env, insn->src_reg)) {
-		verbose("R%d leaks addr into mem\n", insn->src_reg);
+		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
 		return -EACCES;
 	}
 
@@ -1321,7 +1337,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		    register_is_null(regs[regno]))
 			return 0;
 
-		verbose("R%d type=%s expected=%s\n", regno,
+		verbose(env, "R%d type=%s expected=%s\n", regno,
 			reg_type_str[regs[regno].type],
 			reg_type_str[PTR_TO_STACK]);
 		return -EACCES;
@@ -1332,13 +1348,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
-		verbose("invalid variable stack read R%d var_off=%s\n",
+		verbose(env, "invalid variable stack read R%d var_off=%s\n",
 			regno, tn_buf);
 	}
 	off = regs[regno].off + regs[regno].var_off.value;
 	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
 	    access_size <= 0) {
-		verbose("invalid stack type R%d off=%d access_size=%d\n",
+		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
 			regno, off, access_size);
 		return -EACCES;
 	}
@@ -1354,7 +1370,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 
 	for (i = 0; i < access_size; i++) {
 		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
-			verbose("invalid indirect read from stack off %d+%d size %d\n",
+			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
 				off, i, access_size);
 			return -EACCES;
 		}
@@ -1397,7 +1413,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
 	if (arg_type == ARG_ANYTHING) {
 		if (is_pointer_value(env, regno)) {
-			verbose("R%d leaks addr into helper function\n", regno);
+			verbose(env, "R%d leaks addr into helper function\n",
+				regno);
 			return -EACCES;
 		}
 		return 0;
@@ -1405,7 +1422,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
 	if (type_is_pkt_pointer(type) &&
 	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
-		verbose("helper access to the packet is not allowed\n");
+		verbose(env, "helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
 
@@ -1443,7 +1460,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
 	} else {
-		verbose("unsupported arg_type %d\n", arg_type);
+		verbose(env, "unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
 	}
 
@@ -1461,7 +1478,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			 * we have to check map_key here. Otherwise it means
 			 * that kernel subsystem misconfigured verifier
 			 */
-			verbose("invalid map_ptr to access map->key\n");
+			verbose(env, "invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
 		if (type_is_pkt_pointer(type))
@@ -1477,7 +1494,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (!meta->map_ptr) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("invalid map_ptr to access map->value\n");
+			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
 		if (type_is_pkt_pointer(type))
@@ -1497,7 +1514,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (regno == 0) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("ARG_CONST_SIZE cannot be first argument\n");
+			verbose(env,
+				"ARG_CONST_SIZE cannot be first argument\n");
 			return -EACCES;
 		}
 
@@ -1514,7 +1532,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			meta = NULL;
 
 		if (reg->smin_value < 0) {
-			verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+			verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
 				regno);
 			return -EACCES;
 		}
@@ -1528,7 +1546,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		}
 
 		if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-			verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+			verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
 				regno);
 			return -EACCES;
 		}
@@ -1539,12 +1557,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
 	return err;
 err_type:
-	verbose("R%d type=%s expected=%s\n", regno,
+	verbose(env, "R%d type=%s expected=%s\n", regno,
 		reg_type_str[type], reg_type_str[expected_type]);
 	return -EACCES;
 }
 
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+					struct bpf_map *map, int func_id)
 {
 	if (!map)
 		return 0;
@@ -1632,7 +1651,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 
 	return 0;
 error:
-	verbose("cannot pass map_type %d into func %s#%d\n",
+	verbose(env, "cannot pass map_type %d into func %s#%d\n",
 		map->map_type, func_id_name(func_id), func_id);
 	return -EINVAL;
 }
@@ -1666,7 +1685,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 
 	for (i = 0; i < MAX_BPF_REG; i++)
 		if (reg_is_pkt_pointer_any(&regs[i]))
-			mark_reg_unknown(regs, i);
+			mark_reg_unknown(env, regs, i);
 
 	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
 		if (state->stack_slot_type[i] != STACK_SPILL)
@@ -1688,7 +1707,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
 	/* find function prototype */
 	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
-		verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+		verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+			func_id);
 		return -EINVAL;
 	}
 
@@ -1696,13 +1716,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		fn = env->prog->aux->ops->get_func_proto(func_id);
 
 	if (!fn) {
-		verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+			func_id);
 		return -EINVAL;
 	}
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
 	if (!env->prog->gpl_compatible && fn->gpl_only) {
-		verbose("cannot call GPL only function from proprietary program\n");
+		verbose(env, "cannot call GPL only function from proprietary program\n");
 		return -EINVAL;
 	}
 
@@ -1716,7 +1737,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	 */
 	err = check_raw_mode(fn);
 	if (err) {
-		verbose("kernel subsystem misconfigured func %s#%d\n",
+		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
 			func_id_name(func_id), func_id);
 		return err;
 	}
@@ -1749,14 +1770,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
-		mark_reg_not_init(regs, caller_saved[i]);
+		mark_reg_not_init(env, regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
 	/* update return register (already marked as written above) */
 	if (fn->ret_type == RET_INTEGER) {
 		/* sets type to SCALAR_VALUE */
-		mark_reg_unknown(regs, BPF_REG_0);
+		mark_reg_unknown(env, regs, BPF_REG_0);
 	} else if (fn->ret_type == RET_VOID) {
 		regs[BPF_REG_0].type = NOT_INIT;
 	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1764,14 +1785,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
 		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
 		/* There is no offset yet applied, variable or fixed */
-		mark_reg_known_zero(regs, BPF_REG_0);
+		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].off = 0;
 		/* remember map_ptr, so that check_map_access()
 		 * can check 'value_size' boundary of memory access
 		 * to map element returned from bpf_map_lookup_elem()
 		 */
 		if (meta.map_ptr == NULL) {
-			verbose("kernel subsystem misconfigured verifier\n");
+			verbose(env,
+				"kernel subsystem misconfigured verifier\n");
 			return -EINVAL;
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1782,12 +1804,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		else if (insn_aux->map_ptr != meta.map_ptr)
 			insn_aux->map_ptr = BPF_MAP_PTR_POISON;
 	} else {
-		verbose("unknown return type %d of func %s#%d\n",
+		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
 		return -EINVAL;
 	}
 
-	err = check_map_func_compatibility(meta.map_ptr, func_id);
+	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
 
@@ -1846,39 +1868,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	dst_reg = &regs[dst];
 
 	if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: known but bad sbounds\n");
+		print_verifier_state(env, &env->cur_state);
+		verbose(env,
+			"verifier internal error: known but bad sbounds\n");
 		return -EINVAL;
 	}
 	if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: known but bad ubounds\n");
+		print_verifier_state(env, &env->cur_state);
+		verbose(env,
+			"verifier internal error: known but bad ubounds\n");
 		return -EINVAL;
 	}
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
 		/* 32-bit ALU ops on pointers produce (meaningless) scalars */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d 32-bit pointer arithmetic prohibited\n",
+			verbose(env,
+				"R%d 32-bit pointer arithmetic prohibited\n",
 				dst);
 		return -EACCES;
 	}
 
 	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+			verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
 				dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == CONST_PTR_TO_MAP) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+			verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
 				dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == PTR_TO_PACKET_END) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+			verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
 				dst);
 		return -EACCES;
 	}
@@ -1943,7 +1968,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		if (dst_reg == off_reg) {
 			/* scalar -= pointer.  Creates an unknown scalar */
 			if (!env->allow_ptr_leaks)
-				verbose("R%d tried to subtract pointer from scalar\n",
+				verbose(env, "R%d tried to subtract pointer from scalar\n",
 					dst);
 			return -EACCES;
 		}
@@ -1953,7 +1978,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		if (ptr_reg->type == PTR_TO_STACK) {
 			if (!env->allow_ptr_leaks)
-				verbose("R%d subtraction from stack pointer prohibited\n",
+				verbose(env, "R%d subtraction from stack pointer prohibited\n",
 					dst);
 			return -EACCES;
 		}
@@ -2008,13 +2033,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 * ptr &= ~3 which would reduce min_value by 3.)
 		 */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d bitwise operator %s on pointer prohibited\n",
+			verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
 				dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	default:
 		/* other operators (e.g. MUL,LSH) produce non-pointer results */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic with %s operator prohibited\n",
+			verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
 				dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	}
@@ -2180,7 +2205,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			/* Shifts greater than 63 are undefined.  This includes
 			 * shifts by a negative number.
 			 */
-			mark_reg_unknown(regs, insn->dst_reg);
+			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
 		}
 		/* We lose all sign bit information (except what we can pick
@@ -2208,7 +2233,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			/* Shifts greater than 63 are undefined.  This includes
 			 * shifts by a negative number.
 			 */
-			mark_reg_unknown(regs, insn->dst_reg);
+			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
 		}
 		/* BPF_RSH is an unsigned shift, so make the appropriate casts */
@@ -2236,7 +2261,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		__update_reg_bounds(dst_reg);
 		break;
 	default:
-		mark_reg_unknown(regs, insn->dst_reg);
+		mark_reg_unknown(env, regs, insn->dst_reg);
 		break;
 	}
 
@@ -2268,12 +2293,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				 * an arbitrary scalar.
 				 */
 				if (!env->allow_ptr_leaks) {
-					verbose("R%d pointer %s pointer prohibited\n",
+					verbose(env, "R%d pointer %s pointer prohibited\n",
 						insn->dst_reg,
 						bpf_alu_string[opcode >> 4]);
 					return -EACCES;
 				}
-				mark_reg_unknown(regs, insn->dst_reg);
+				mark_reg_unknown(env, regs, insn->dst_reg);
 				return 0;
 			} else {
 				/* scalar += pointer
@@ -2325,13 +2350,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
 	/* Got here implies adding two SCALAR_VALUEs */
 	if (WARN_ON_ONCE(ptr_reg)) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: unexpected ptr_reg\n");
+		print_verifier_state(env, &env->cur_state);
+		verbose(env, "verifier internal error: unexpected ptr_reg\n");
 		return -EINVAL;
 	}
 	if (WARN_ON(!src_reg)) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: no src_reg\n");
+		print_verifier_state(env, &env->cur_state);
+		verbose(env, "verifier internal error: no src_reg\n");
 		return -EINVAL;
 	}
 	return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2349,14 +2374,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			if (BPF_SRC(insn->code) != 0 ||
 			    insn->src_reg != BPF_REG_0 ||
 			    insn->off != 0 || insn->imm != 0) {
-				verbose("BPF_NEG uses reserved fields\n");
+				verbose(env, "BPF_NEG uses reserved fields\n");
 				return -EINVAL;
 			}
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
 			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
 			    BPF_CLASS(insn->code) == BPF_ALU64) {
-				verbose("BPF_END uses reserved fields\n");
+				verbose(env, "BPF_END uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2367,7 +2392,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			return err;
 
 		if (is_pointer_value(env, insn->dst_reg)) {
-			verbose("R%d pointer arithmetic prohibited\n",
+			verbose(env, "R%d pointer arithmetic prohibited\n",
 				insn->dst_reg);
 			return -EACCES;
 		}
@@ -2381,7 +2406,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		if (BPF_SRC(insn->code) == BPF_X) {
 			if (insn->imm != 0 || insn->off != 0) {
-				verbose("BPF_MOV uses reserved fields\n");
+				verbose(env, "BPF_MOV uses reserved fields\n");
 				return -EINVAL;
 			}
 
@@ -2391,7 +2416,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				return err;
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-				verbose("BPF_MOV uses reserved fields\n");
+				verbose(env, "BPF_MOV uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2411,11 +2436,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
-					verbose("R%d partial copy of pointer\n",
+					verbose(env,
+						"R%d partial copy of pointer\n",
 						insn->src_reg);
 					return -EACCES;
 				}
-				mark_reg_unknown(regs, insn->dst_reg);
+				mark_reg_unknown(env, regs, insn->dst_reg);
 				/* high 32 bits are known zero. */
 				regs[insn->dst_reg].var_off = tnum_cast(
 						regs[insn->dst_reg].var_off, 4);
@@ -2430,14 +2456,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		}
 
 	} else if (opcode > BPF_END) {
-		verbose("invalid BPF_ALU opcode %x\n", opcode);
+		verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
 		return -EINVAL;
 
 	} else {	/* all other ALU ops: and, sub, xor, add, ... */
 
 		if (BPF_SRC(insn->code) == BPF_X) {
 			if (insn->imm != 0 || insn->off != 0) {
-				verbose("BPF_ALU uses reserved fields\n");
+				verbose(env, "BPF_ALU uses reserved fields\n");
 				return -EINVAL;
 			}
 			/* check src1 operand */
@@ -2446,7 +2472,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				return err;
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-				verbose("BPF_ALU uses reserved fields\n");
+				verbose(env, "BPF_ALU uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2458,7 +2484,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
 		    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
-			verbose("div by zero\n");
+			verbose(env, "div by zero\n");
 			return -EINVAL;
 		}
 
@@ -2467,7 +2493,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
 
 			if (insn->imm < 0 || insn->imm >= size) {
-				verbose("invalid shift %d\n", insn->imm);
+				verbose(env, "invalid shift %d\n", insn->imm);
 				return -EINVAL;
 			}
 		}
@@ -2820,13 +2846,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 	int err;
 
 	if (opcode > BPF_JSLE) {
-		verbose("invalid BPF_JMP opcode %x\n", opcode);
+		verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
 		return -EINVAL;
 	}
 
 	if (BPF_SRC(insn->code) == BPF_X) {
 		if (insn->imm != 0) {
-			verbose("BPF_JMP uses reserved fields\n");
+			verbose(env, "BPF_JMP uses reserved fields\n");
 			return -EINVAL;
 		}
 
@@ -2836,13 +2862,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			return err;
 
 		if (is_pointer_value(env, insn->src_reg)) {
-			verbose("R%d pointer comparison prohibited\n",
+			verbose(env, "R%d pointer comparison prohibited\n",
 				insn->src_reg);
 			return -EACCES;
 		}
 	} else {
 		if (insn->src_reg != BPF_REG_0) {
-			verbose("BPF_JMP uses reserved fields\n");
+			verbose(env, "BPF_JMP uses reserved fields\n");
 			return -EINVAL;
 		}
 	}
@@ -2954,11 +2980,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
 				       PTR_TO_PACKET_META);
 	} else if (is_pointer_value(env, insn->dst_reg)) {
-		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+		verbose(env, "R%d pointer comparison prohibited\n",
+			insn->dst_reg);
 		return -EACCES;
 	}
-	if (verifier_log.level)
-		print_verifier_state(this_branch);
+	if (env->log.level)
+		print_verifier_state(env, this_branch);
 	return 0;
 }
 
@@ -2977,11 +3004,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	int err;
 
 	if (BPF_SIZE(insn->code) != BPF_DW) {
-		verbose("invalid BPF_LD_IMM insn\n");
+		verbose(env, "invalid BPF_LD_IMM insn\n");
 		return -EINVAL;
 	}
 	if (insn->off != 0) {
-		verbose("BPF_LD_IMM64 uses reserved fields\n");
+		verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -3039,14 +3066,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	int i, err;
 
 	if (!may_access_skb(env->prog->type)) {
-		verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+		verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
 		return -EINVAL;
 	}
 
 	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
 	    BPF_SIZE(insn->code) == BPF_DW ||
 	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
-		verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+		verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -3056,7 +3083,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return err;
 
 	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
-		verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+		verbose(env,
+			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
 		return -EINVAL;
 	}
 
@@ -3069,7 +3097,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 	/* reset caller saved regs to unreadable */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
-		mark_reg_not_init(regs, caller_saved[i]);
+		mark_reg_not_init(env, regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
@@ -3077,7 +3105,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * the value fetched from the packet.
 	 * Already marked as written above.
 	 */
-	mark_reg_unknown(regs, BPF_REG_0);
+	mark_reg_unknown(env, regs, BPF_REG_0);
 	return 0;
 }
 
@@ -3097,22 +3125,22 @@ static int check_return_code(struct bpf_verifier_env *env)
 
 	reg = &env->cur_state.regs[BPF_REG_0];
 	if (reg->type != SCALAR_VALUE) {
-		verbose("At program exit the register R0 is not a known value (%s)\n",
+		verbose(env, "At program exit the register R0 is not a known value (%s)\n",
 			reg_type_str[reg->type]);
 		return -EINVAL;
 	}
 
 	if (!tnum_in(range, reg->var_off)) {
-		verbose("At program exit the register R0 ");
+		verbose(env, "At program exit the register R0 ");
 		if (!tnum_is_unknown(reg->var_off)) {
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose("has value %s", tn_buf);
+			verbose(env, "has value %s", tn_buf);
 		} else {
-			verbose("has unknown scalar value");
+			verbose(env, "has unknown scalar value");
 		}
-		verbose(" should have been 0 or 1\n");
+		verbose(env, " should have been 0 or 1\n");
 		return -EINVAL;
 	}
 	return 0;
@@ -3178,7 +3206,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		return 0;
 
 	if (w < 0 || w >= env->prog->len) {
-		verbose("jump out of range from insn %d to %d\n", t, w);
+		verbose(env, "jump out of range from insn %d to %d\n", t, w);
 		return -EINVAL;
 	}
 
@@ -3195,13 +3223,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		insn_stack[cur_stack++] = w;
 		return 1;
 	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
-		verbose("back-edge from insn %d to %d\n", t, w);
+		verbose(env, "back-edge from insn %d to %d\n", t, w);
 		return -EINVAL;
 	} else if (insn_state[w] == EXPLORED) {
 		/* forward- or cross-edge */
 		insn_state[t] = DISCOVERED | e;
 	} else {
-		verbose("insn state internal bug\n");
+		verbose(env, "insn state internal bug\n");
 		return -EFAULT;
 	}
 	return 0;
@@ -3295,7 +3323,7 @@ peek_stack:
 mark_explored:
 	insn_state[t] = EXPLORED;
 	if (cur_stack-- <= 0) {
-		verbose("pop stack internal bug\n");
+		verbose(env, "pop stack internal bug\n");
 		ret = -EFAULT;
 		goto err_free;
 	}
@@ -3304,7 +3332,7 @@ mark_explored:
 check_state:
 	for (i = 0; i < insn_cnt; i++) {
 		if (insn_state[i] != EXPLORED) {
-			verbose("unreachable insn %d\n", i);
+			verbose(env, "unreachable insn %d\n", i);
 			ret = -EINVAL;
 			goto err_free;
 		}
@@ -3685,7 +3713,7 @@ static int do_check(struct bpf_verifier_env *env)
 	int insn_processed = 0;
 	bool do_print_state = false;
 
-	init_reg_state(regs);
+	init_reg_state(env, regs);
 	state->parent = NULL;
 	insn_idx = 0;
 	for (;;) {
@@ -3694,7 +3722,7 @@ static int do_check(struct bpf_verifier_env *env)
 		int err;
 
 		if (insn_idx >= insn_cnt) {
-			verbose("invalid insn idx %d insn_cnt %d\n",
+			verbose(env, "invalid insn idx %d insn_cnt %d\n",
 				insn_idx, insn_cnt);
 			return -EFAULT;
 		}
@@ -3703,7 +3731,8 @@ static int do_check(struct bpf_verifier_env *env)
 		class = BPF_CLASS(insn->code);
 
 		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
-			verbose("BPF program is too large. Processed %d insn\n",
+			verbose(env,
+				"BPF program is too large. Processed %d insn\n",
 				insn_processed);
 			return -E2BIG;
 		}
@@ -3713,12 +3742,12 @@ static int do_check(struct bpf_verifier_env *env)
 			return err;
 		if (err == 1) {
 			/* found equivalent state, can prune the search */
-			if (verifier_log.level) {
+			if (env->log.level) {
 				if (do_print_state)
-					verbose("\nfrom %d to %d: safe\n",
+					verbose(env, "\nfrom %d to %d: safe\n",
 						prev_insn_idx, insn_idx);
 				else
-					verbose("%d: safe\n", insn_idx);
+					verbose(env, "%d: safe\n", insn_idx);
 			}
 			goto process_bpf_exit;
 		}
@@ -3726,19 +3755,18 @@ static int do_check(struct bpf_verifier_env *env)
 		if (need_resched())
 			cond_resched();
 
-		if (verifier_log.level > 1 ||
-		    (verifier_log.level && do_print_state)) {
-			if (verifier_log.level > 1)
-				verbose("%d:", insn_idx);
+		if (env->log.level > 1 || (env->log.level && do_print_state)) {
+			if (env->log.level > 1)
+				verbose(env, "%d:", insn_idx);
 			else
-				verbose("\nfrom %d to %d:",
+				verbose(env, "\nfrom %d to %d:",
 					prev_insn_idx, insn_idx);
-			print_verifier_state(&env->cur_state);
+			print_verifier_state(env, &env->cur_state);
 			do_print_state = false;
 		}
 
-		if (verifier_log.level) {
-			verbose("%d: ", insn_idx);
+		if (env->log.level) {
+			verbose(env, "%d: ", insn_idx);
 			print_bpf_insn(env, insn);
 		}
 
@@ -3795,7 +3823,7 @@ static int do_check(struct bpf_verifier_env *env)
 				 * src_reg == stack|map in some other branch.
 				 * Reject it.
 				 */
-				verbose("same insn cannot be used with different pointers\n");
+				verbose(env, "same insn cannot be used with different pointers\n");
 				return -EINVAL;
 			}
 
@@ -3835,14 +3863,14 @@ static int do_check(struct bpf_verifier_env *env)
 			} else if (dst_reg_type != *prev_dst_type &&
 				   (dst_reg_type == PTR_TO_CTX ||
 				    *prev_dst_type == PTR_TO_CTX)) {
-				verbose("same insn cannot be used with different pointers\n");
+				verbose(env, "same insn cannot be used with different pointers\n");
 				return -EINVAL;
 			}
 
 		} else if (class == BPF_ST) {
 			if (BPF_MODE(insn->code) != BPF_MEM ||
 			    insn->src_reg != BPF_REG_0) {
-				verbose("BPF_ST uses reserved fields\n");
+				verbose(env, "BPF_ST uses reserved fields\n");
 				return -EINVAL;
 			}
 			/* check src operand */
@@ -3865,7 +3893,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->off != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_CALL uses reserved fields\n");
+					verbose(env, "BPF_CALL uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3878,7 +3906,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->imm != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_JA uses reserved fields\n");
+					verbose(env, "BPF_JA uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3890,7 +3918,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->imm != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_EXIT uses reserved fields\n");
+					verbose(env, "BPF_EXIT uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3905,7 +3933,7 @@ static int do_check(struct bpf_verifier_env *env)
 					return err;
 
 				if (is_pointer_value(env, BPF_REG_0)) {
-					verbose("R0 leaks addr as return value\n");
+					verbose(env, "R0 leaks addr as return value\n");
 					return -EACCES;
 				}
 
@@ -3940,19 +3968,19 @@ process_bpf_exit:
 
 				insn_idx++;
 			} else {
-				verbose("invalid BPF_LD mode\n");
+				verbose(env, "invalid BPF_LD mode\n");
 				return -EINVAL;
 			}
 		} else {
-			verbose("unknown insn class %d\n", class);
+			verbose(env, "unknown insn class %d\n", class);
 			return -EINVAL;
 		}
 
 		insn_idx++;
 	}
 
-	verbose("processed %d insns, stack depth %d\n",
-		insn_processed, env->prog->aux->stack_depth);
+	verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+		env->prog->aux->stack_depth);
 	return 0;
 }
 
@@ -3964,7 +3992,8 @@ static int check_map_prealloc(struct bpf_map *map)
 		!(map->map_flags & BPF_F_NO_PREALLOC);
 }
 
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+					struct bpf_map *map,
 					struct bpf_prog *prog)
 
 {
@@ -3975,12 +4004,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
 	 */
 	if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
 		if (!check_map_prealloc(map)) {
-			verbose("perf_event programs can only use preallocated hash map\n");
+			verbose(env, "perf_event programs can only use preallocated hash map\n");
 			return -EINVAL;
 		}
 		if (map->inner_map_meta &&
 		    !check_map_prealloc(map->inner_map_meta)) {
-			verbose("perf_event programs can only use preallocated inner hash map\n");
+			verbose(env, "perf_event programs can only use preallocated inner hash map\n");
 			return -EINVAL;
 		}
 	}
@@ -4003,14 +4032,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (BPF_CLASS(insn->code) == BPF_LDX &&
 		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
-			verbose("BPF_LDX uses reserved fields\n");
+			verbose(env, "BPF_LDX uses reserved fields\n");
 			return -EINVAL;
 		}
 
 		if (BPF_CLASS(insn->code) == BPF_STX &&
 		    ((BPF_MODE(insn->code) != BPF_MEM &&
 		      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
-			verbose("BPF_STX uses reserved fields\n");
+			verbose(env, "BPF_STX uses reserved fields\n");
 			return -EINVAL;
 		}
 
@@ -4021,7 +4050,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			if (i == insn_cnt - 1 || insn[1].code != 0 ||
 			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
 			    insn[1].off != 0) {
-				verbose("invalid bpf_ld_imm64 insn\n");
+				verbose(env, "invalid bpf_ld_imm64 insn\n");
 				return -EINVAL;
 			}
 
@@ -4030,19 +4059,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 				goto next_insn;
 
 			if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
-				verbose("unrecognized bpf_ld_imm64 insn\n");
+				verbose(env,
+					"unrecognized bpf_ld_imm64 insn\n");
 				return -EINVAL;
 			}
 
 			f = fdget(insn->imm);
 			map = __bpf_map_get(f);
 			if (IS_ERR(map)) {
-				verbose("fd %d is not pointing to valid bpf_map\n",
+				verbose(env, "fd %d is not pointing to valid bpf_map\n",
 					insn->imm);
 				return PTR_ERR(map);
 			}
 
-			err = check_map_prog_compatibility(map, env->prog);
+			err = check_map_prog_compatibility(env, map, env->prog);
 			if (err) {
 				fdput(f);
 				return err;
@@ -4164,7 +4194,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
 					env->prog);
 		if (cnt >= ARRAY_SIZE(insn_buf)) {
-			verbose("bpf verifier is misconfigured\n");
+			verbose(env, "bpf verifier is misconfigured\n");
 			return -EINVAL;
 		} else if (cnt) {
 			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4212,7 +4242,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			u8 size_code;
 
 			if (type == BPF_WRITE) {
-				verbose("bpf verifier narrow ctx access misconfigured\n");
+				verbose(env, "bpf verifier narrow ctx access misconfigured\n");
 				return -EINVAL;
 			}
 
@@ -4231,7 +4261,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 					      &target_size);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
 		    (ctx_field_size && !target_size)) {
-			verbose("bpf verifier is misconfigured\n");
+			verbose(env, "bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
 
@@ -4313,7 +4343,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 
 			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
 			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
-				verbose("bpf verifier is misconfigured\n");
+				verbose(env, "bpf verifier is misconfigured\n");
 				return -EINVAL;
 			}
 
@@ -4357,7 +4387,8 @@ patch_call_imm:
 		 * programs to call them, must be real in-kernel functions
 		 */
 		if (!fn->func) {
-			verbose("kernel subsystem misconfigured func %s#%d\n",
+			verbose(env,
+				"kernel subsystem misconfigured func %s#%d\n",
 				func_id_name(insn->imm), insn->imm);
 			return -EFAULT;
 		}
@@ -4391,8 +4422,8 @@ static void free_states(struct bpf_verifier_env *env)
 
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 {
-	struct bpf_verifer_log *log = &verifier_log;
 	struct bpf_verifier_env *env;
+	struct bpf_verifer_log *log;
 	int ret = -EINVAL;
 
 	/* 'struct bpf_verifier_env' can be global, but since it's not small,
@@ -4401,6 +4432,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
 	if (!env)
 		return -ENOMEM;
+	log = &env->log;
 
 	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
 				     (*prog)->len);
@@ -4419,7 +4451,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		log->level = attr->log_level;
 		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
 		log->len_total = attr->log_size;
-		log->len_used = 0;
 
 		ret = -EINVAL;
 		/* log attributes have to be sane */
@@ -4431,8 +4462,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		log->kbuf = vmalloc(log->len_total);
 		if (!log->kbuf)
 			goto err_unlock;
-	} else {
-		log->level = 0;
 	}
 
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4543,8 +4572,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
 
-	verifier_log.level = 0;
-
 	env->strict_alignment = false;
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
-- 
cgit v1.3-14-g43fede


From f4ac7e0b5cc8d16004ac57ff679266d573f30f77 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:12 -0700
Subject: bpf: move instruction printing into a separate file

Separate the instruction printing into a standalone source file.
This way sneaky code from tools/ can compile it in directly.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/Makefile   |   1 +
 kernel/bpf/disasm.c   | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/disasm.h   |  32 ++++++++
 kernel/bpf/verifier.c | 202 +----------------------------------------------
 4 files changed, 251 insertions(+), 198 deletions(-)
 create mode 100644 kernel/bpf/disasm.c
 create mode 100644 kernel/bpf/disasm.h

(limited to 'kernel')

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 897daa005b23..53fb09f92e3f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,6 +2,7 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..e682850c9715
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+		return func_id_str[id];
+	else
+		return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+	[BPF_LD]    = "ld",
+	[BPF_LDX]   = "ldx",
+	[BPF_ST]    = "st",
+	[BPF_STX]   = "stx",
+	[BPF_ALU]   = "alu",
+	[BPF_JMP]   = "jmp",
+	[BPF_RET]   = "BUG",
+	[BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+	[BPF_ADD >> 4]  = "+=",
+	[BPF_SUB >> 4]  = "-=",
+	[BPF_MUL >> 4]  = "*=",
+	[BPF_DIV >> 4]  = "/=",
+	[BPF_OR  >> 4]  = "|=",
+	[BPF_AND >> 4]  = "&=",
+	[BPF_LSH >> 4]  = "<<=",
+	[BPF_RSH >> 4]  = ">>=",
+	[BPF_NEG >> 4]  = "neg",
+	[BPF_MOD >> 4]  = "%=",
+	[BPF_XOR >> 4]  = "^=",
+	[BPF_MOV >> 4]  = "=",
+	[BPF_ARSH >> 4] = "s>>=",
+	[BPF_END >> 4]  = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+	[BPF_W >> 3]  = "u32",
+	[BPF_H >> 3]  = "u16",
+	[BPF_B >> 3]  = "u8",
+	[BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+	[BPF_JA >> 4]   = "jmp",
+	[BPF_JEQ >> 4]  = "==",
+	[BPF_JGT >> 4]  = ">",
+	[BPF_JLT >> 4]  = "<",
+	[BPF_JGE >> 4]  = ">=",
+	[BPF_JLE >> 4]  = "<=",
+	[BPF_JSET >> 4] = "&",
+	[BPF_JNE >> 4]  = "!=",
+	[BPF_JSGT >> 4] = "s>",
+	[BPF_JSLT >> 4] = "s<",
+	[BPF_JSGE >> 4] = "s>=",
+	[BPF_JSLE >> 4] = "s<=",
+	[BPF_CALL >> 4] = "call",
+	[BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+			       struct bpf_verifier_env *env,
+			       const struct bpf_insn *insn)
+{
+	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+		insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+	u8 class = BPF_CLASS(insn->code);
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (BPF_OP(insn->code) == BPF_END) {
+			if (class == BPF_ALU64)
+				verbose(env, "BUG_alu64_%02x\n", insn->code);
+			else
+				print_bpf_end_insn(verbose, env, insn);
+		} else if (BPF_OP(insn->code) == BPF_NEG) {
+			verbose(env, "(%02x) r%d = %s-r%d\n",
+				insn->code, insn->dst_reg,
+				class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose(env, "(%02x) %sr%d %s %sr%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->src_reg);
+		} else {
+			verbose(env, "(%02x) %sr%d %s %s%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->imm);
+		}
+	} else if (class == BPF_STX) {
+		if (BPF_MODE(insn->code) == BPF_MEM)
+			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg,
+				insn->off, insn->src_reg);
+		else if (BPF_MODE(insn->code) == BPF_XADD)
+			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off,
+				insn->src_reg);
+		else
+			verbose(env, "BUG_%02x\n", insn->code);
+	} else if (class == BPF_ST) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose(env, "BUG_st_%02x\n", insn->code);
+			return;
+		}
+		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+			insn->code,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->dst_reg,
+			insn->off, insn->imm);
+	} else if (class == BPF_LDX) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose(env, "BUG_ldx_%02x\n", insn->code);
+			return;
+		}
+		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+			insn->code, insn->dst_reg,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->src_reg, insn->off);
+	} else if (class == BPF_LD) {
+		if (BPF_MODE(insn->code) == BPF_ABS) {
+			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IND) {
+			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->src_reg, insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IMM &&
+			   BPF_SIZE(insn->code) == BPF_DW) {
+			/* At this point, we already made sure that the second
+			 * part of the ldimm64 insn is accessible.
+			 */
+			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+			if (map_ptr && !allow_ptr_leaks)
+				imm = 0;
+
+			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+				insn->dst_reg, (unsigned long long)imm);
+		} else {
+			verbose(env, "BUG_ld_%02x\n", insn->code);
+			return;
+		}
+	} else if (class == BPF_JMP) {
+		u8 opcode = BPF_OP(insn->code);
+
+		if (opcode == BPF_CALL) {
+			verbose(env, "(%02x) call %s#%d\n", insn->code,
+				func_id_name(insn->imm), insn->imm);
+		} else if (insn->code == (BPF_JMP | BPF_JA)) {
+			verbose(env, "(%02x) goto pc%+d\n",
+				insn->code, insn->off);
+		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+			verbose(env, "(%02x) exit\n", insn->code);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->src_reg, insn->off);
+		} else {
+			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->imm, insn->off);
+		}
+	} else {
+		verbose(env, "(%02x) %s\n",
+			insn->code, bpf_class_string[class]);
+	}
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..8de977e420b6
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+				  const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a352f93cd4b2..274c6582ec39 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,8 @@
 #include <linux/vmalloc.h>
 #include <linux/stringify.h>
 
+#include "disasm.h"
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -194,22 +196,6 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
-	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
-	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
-		return func_id_str[id];
-	else
-		return "unknown";
-}
-
 static void print_verifier_state(struct bpf_verifier_env *env,
 				 struct bpf_verifier_state *state)
 {
@@ -278,187 +264,6 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 	verbose(env, "\n");
 }
 
-static const char *const bpf_class_string[] = {
-	[BPF_LD]    = "ld",
-	[BPF_LDX]   = "ldx",
-	[BPF_ST]    = "st",
-	[BPF_STX]   = "stx",
-	[BPF_ALU]   = "alu",
-	[BPF_JMP]   = "jmp",
-	[BPF_RET]   = "BUG",
-	[BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
-	[BPF_ADD >> 4]  = "+=",
-	[BPF_SUB >> 4]  = "-=",
-	[BPF_MUL >> 4]  = "*=",
-	[BPF_DIV >> 4]  = "/=",
-	[BPF_OR  >> 4]  = "|=",
-	[BPF_AND >> 4]  = "&=",
-	[BPF_LSH >> 4]  = "<<=",
-	[BPF_RSH >> 4]  = ">>=",
-	[BPF_NEG >> 4]  = "neg",
-	[BPF_MOD >> 4]  = "%=",
-	[BPF_XOR >> 4]  = "^=",
-	[BPF_MOV >> 4]  = "=",
-	[BPF_ARSH >> 4] = "s>>=",
-	[BPF_END >> 4]  = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
-	[BPF_W >> 3]  = "u32",
-	[BPF_H >> 3]  = "u16",
-	[BPF_B >> 3]  = "u8",
-	[BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
-	[BPF_JA >> 4]   = "jmp",
-	[BPF_JEQ >> 4]  = "==",
-	[BPF_JGT >> 4]  = ">",
-	[BPF_JLT >> 4]  = "<",
-	[BPF_JGE >> 4]  = ">=",
-	[BPF_JLE >> 4]  = "<=",
-	[BPF_JSET >> 4] = "&",
-	[BPF_JNE >> 4]  = "!=",
-	[BPF_JSGT >> 4] = "s>",
-	[BPF_JSLT >> 4] = "s<",
-	[BPF_JSGE >> 4] = "s>=",
-	[BPF_JSLE >> 4] = "s<=",
-	[BPF_CALL >> 4] = "call",
-	[BPF_EXIT >> 4] = "exit",
-};
-
-static void print_bpf_end_insn(struct bpf_verifier_env *env,
-			       const struct bpf_insn *insn)
-{
-	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
-		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
-		insn->imm, insn->dst_reg);
-}
-
-static void print_bpf_insn(struct bpf_verifier_env *env,
-			   const struct bpf_insn *insn)
-{
-	u8 class = BPF_CLASS(insn->code);
-
-	if (class == BPF_ALU || class == BPF_ALU64) {
-		if (BPF_OP(insn->code) == BPF_END) {
-			if (class == BPF_ALU64)
-				verbose(env, "BUG_alu64_%02x\n", insn->code);
-			else
-				print_bpf_end_insn(env, insn);
-		} else if (BPF_OP(insn->code) == BPF_NEG) {
-			verbose(env, "(%02x) r%d = %s-r%d\n",
-				insn->code, insn->dst_reg,
-				class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg);
-		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose(env, "(%02x) %sr%d %s %sr%d\n",
-				insn->code, class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg,
-				bpf_alu_string[BPF_OP(insn->code) >> 4],
-				class == BPF_ALU ? "(u32) " : "",
-				insn->src_reg);
-		} else {
-			verbose(env, "(%02x) %sr%d %s %s%d\n",
-				insn->code, class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg,
-				bpf_alu_string[BPF_OP(insn->code) >> 4],
-				class == BPF_ALU ? "(u32) " : "",
-				insn->imm);
-		}
-	} else if (class == BPF_STX) {
-		if (BPF_MODE(insn->code) == BPF_MEM)
-			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->dst_reg,
-				insn->off, insn->src_reg);
-		else if (BPF_MODE(insn->code) == BPF_XADD)
-			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->dst_reg, insn->off,
-				insn->src_reg);
-		else
-			verbose(env, "BUG_%02x\n", insn->code);
-	} else if (class == BPF_ST) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose(env, "BUG_st_%02x\n", insn->code);
-			return;
-		}
-		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
-			insn->code,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->dst_reg,
-			insn->off, insn->imm);
-	} else if (class == BPF_LDX) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose(env, "BUG_ldx_%02x\n", insn->code);
-			return;
-		}
-		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
-			insn->code, insn->dst_reg,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->src_reg, insn->off);
-	} else if (class == BPF_LD) {
-		if (BPF_MODE(insn->code) == BPF_ABS) {
-			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->imm);
-		} else if (BPF_MODE(insn->code) == BPF_IND) {
-			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->src_reg, insn->imm);
-		} else if (BPF_MODE(insn->code) == BPF_IMM &&
-			   BPF_SIZE(insn->code) == BPF_DW) {
-			/* At this point, we already made sure that the second
-			 * part of the ldimm64 insn is accessible.
-			 */
-			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
-			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
-
-			if (map_ptr && !env->allow_ptr_leaks)
-				imm = 0;
-
-			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
-				insn->dst_reg, (unsigned long long)imm);
-		} else {
-			verbose(env, "BUG_ld_%02x\n", insn->code);
-			return;
-		}
-	} else if (class == BPF_JMP) {
-		u8 opcode = BPF_OP(insn->code);
-
-		if (opcode == BPF_CALL) {
-			verbose(env, "(%02x) call %s#%d\n", insn->code,
-				func_id_name(insn->imm), insn->imm);
-		} else if (insn->code == (BPF_JMP | BPF_JA)) {
-			verbose(env, "(%02x) goto pc%+d\n",
-				insn->code, insn->off);
-		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-			verbose(env, "(%02x) exit\n", insn->code);
-		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
-				insn->code, insn->dst_reg,
-				bpf_jmp_string[BPF_OP(insn->code) >> 4],
-				insn->src_reg, insn->off);
-		} else {
-			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
-				insn->code, insn->dst_reg,
-				bpf_jmp_string[BPF_OP(insn->code) >> 4],
-				insn->imm, insn->off);
-		}
-	} else {
-		verbose(env, "(%02x) %s\n",
-			insn->code, bpf_class_string[class]);
-	}
-}
-
 static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
 {
 	struct bpf_verifier_stack_elem *elem;
@@ -3767,7 +3572,8 @@ static int do_check(struct bpf_verifier_env *env)
 
 		if (env->log.level) {
 			verbose(env, "%d: ", insn_idx);
-			print_bpf_insn(env, insn);
+			print_bpf_insn(verbose, env, insn,
+				       env->allow_ptr_leaks);
 		}
 
 		err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
-- 
cgit v1.3-14-g43fede


From a2a7d5701052542cd2260e7659b12443e0a74733 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 9 Oct 2017 10:30:15 -0700
Subject: bpf: write back the verifier log buffer as it gets filled

Verifier log buffer can be quite large (up to 16MB currently).
As Eric Dumazet points out if we allow multiple verification
requests to proceed simultaneously, malicious user may use the
verifier as a way of allocating large amounts of unswappable
memory to OOM the host.

Switch to a strategy of allocating a smaller buffer (1024B)
and writing it out into the user buffer after every print.

While at it remove the old BUG_ON().

This is in preparation of the global verifier lock removal.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |  4 +++-
 kernel/bpf/verifier.c        | 41 +++++++++++++++++++----------------------
 2 files changed, 22 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 5ddb9a626a51..f00ef751c1c5 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -115,9 +115,11 @@ struct bpf_insn_aux_data {
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
 
+#define BPF_VERIFIER_TMP_LOG_SIZE	1024
+
 struct bpf_verifer_log {
 	u32 level;
-	char *kbuf;
+	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 	char __user *ubuf;
 	u32 len_used;
 	u32 len_total;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 274c6582ec39..2cdbcc4f8f6b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -165,15 +165,26 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
 				   const char *fmt, ...)
 {
 	struct bpf_verifer_log *log = &env->log;
+	unsigned int n;
 	va_list args;
 
-	if (!log->level || bpf_verifier_log_full(log))
+	if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
 		return;
 
 	va_start(args, fmt);
-	log->len_used += vscnprintf(log->kbuf + log->len_used,
-				    log->len_total - log->len_used, fmt, args);
+	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
 	va_end(args);
+
+	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+		  "verifier log line truncated - local buffer too short\n");
+
+	n = min(log->len_total - log->len_used - 1, n);
+	log->kbuf[n] = '\0';
+
+	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+		log->len_used += n;
+	else
+		log->ubuf = NULL;
 }
 
 static bool type_is_pkt_pointer(enum bpf_reg_type type)
@@ -4263,11 +4274,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
 		    !log->level || !log->ubuf)
 			goto err_unlock;
-
-		ret = -ENOMEM;
-		log->kbuf = vmalloc(log->len_total);
-		if (!log->kbuf)
-			goto err_unlock;
 	}
 
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
@@ -4304,18 +4310,11 @@ skip_full_check:
 	if (ret == 0)
 		ret = fixup_bpf_calls(env);
 
-	if (log->level && bpf_verifier_log_full(log)) {
-		BUG_ON(log->len_used >= log->len_total);
-		/* verifier log exceeded user supplied buffer */
+	if (log->level && bpf_verifier_log_full(log))
 		ret = -ENOSPC;
-		/* fall through to return what was recorded */
-	}
-
-	/* copy verifier log back to user space including trailing zero */
-	if (log->level && copy_to_user(log->ubuf, log->kbuf,
-				       log->len_used + 1) != 0) {
+	if (log->level && !log->ubuf) {
 		ret = -EFAULT;
-		goto free_log_buf;
+		goto err_release_maps;
 	}
 
 	if (ret == 0 && env->used_map_cnt) {
@@ -4326,7 +4325,7 @@ skip_full_check:
 
 		if (!env->prog->aux->used_maps) {
 			ret = -ENOMEM;
-			goto free_log_buf;
+			goto err_release_maps;
 		}
 
 		memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4339,9 +4338,7 @@ skip_full_check:
 		convert_pseudo_ld_imm64(env);
 	}
 
-free_log_buf:
-	if (log->level)
-		vfree(log->kbuf);
+err_release_maps:
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
 		 * them now. Otherwise free_bpf_prog_info() will release them.
-- 
cgit v1.3-14-g43fede


From d59158162e032917a428704160a2063a02405ec6 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Tue, 10 Oct 2017 15:51:37 -0700
Subject: tracing: Add support for preempt and irq enable/disable events

Preempt and irq trace events can be used for tracing the start and
end of an atomic section which can be used by a trace viewer like
systrace to graphically view the start and end of an atomic section and
correlate them with latencies and scheduling issues.

This also serves as a prelude to using synthetic events or probes to
rewrite the preempt and irqsoff tracers, along with numerous benefits of
using trace events features for these events.
Link: http://lkml.kernel.org/r/20171006005432.14244-3-joelaf@google.com
Link: http://lkml.kernel.org/r/20171010225137.17370-1-joelaf@google.com

Cc: Peter Zilstra <peterz@infradead.org>
Cc: kernel-team@android.com
Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h            |  3 +-
 include/trace/events/preemptirq.h | 70 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/Kconfig              | 11 ++++++
 kernel/trace/Makefile             |  1 +
 kernel/trace/trace_irqsoff.c      | 35 +++++++++++++++++++-
 5 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/preemptirq.h

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 346f8294e40a..1f8545caa691 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -769,7 +769,8 @@ static inline unsigned long get_lock_parent_ip(void)
   static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
 #endif
 
-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+	(defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
   extern void trace_preempt_on(unsigned long a0, unsigned long a1);
   extern void trace_preempt_off(unsigned long a0, unsigned long a1);
 #else
diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
new file mode 100644
index 000000000000..f5024c560d8f
--- /dev/null
+++ b/include/trace/events/preemptirq.h
@@ -0,0 +1,70 @@
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM preemptirq
+
+#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PREEMPTIRQ_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+#include <linux/string.h>
+#include <asm/sections.h>
+
+DECLARE_EVENT_CLASS(preemptirq_template,
+
+	TP_PROTO(unsigned long ip, unsigned long parent_ip),
+
+	TP_ARGS(ip, parent_ip),
+
+	TP_STRUCT__entry(
+		__field(u32, caller_offs)
+		__field(u32, parent_offs)
+	),
+
+	TP_fast_assign(
+		__entry->caller_offs = (u32)(ip - (unsigned long)_stext);
+		__entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
+	),
+
+	TP_printk("caller=%pF parent=%pF",
+		  (void *)((unsigned long)(_stext) + __entry->caller_offs),
+		  (void *)((unsigned long)(_stext) + __entry->parent_offs))
+);
+
+#ifndef CONFIG_PROVE_LOCKING
+DEFINE_EVENT(preemptirq_template, irq_disable,
+	     TP_PROTO(unsigned long ip, unsigned long parent_ip),
+	     TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, irq_enable,
+	     TP_PROTO(unsigned long ip, unsigned long parent_ip),
+	     TP_ARGS(ip, parent_ip));
+#endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+DEFINE_EVENT(preemptirq_template, preempt_disable,
+	     TP_PROTO(unsigned long ip, unsigned long parent_ip),
+	     TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, preempt_enable,
+	     TP_PROTO(unsigned long ip, unsigned long parent_ip),
+	     TP_ARGS(ip, parent_ip));
+#endif
+
+#endif /* _TRACE_PREEMPTIRQ_H */
+
+#include <trace/define_trace.h>
+
+#else /* !CONFIG_PREEMPTIRQ_EVENTS */
+
+#define trace_irq_enable(...)
+#define trace_irq_disable(...)
+#define trace_preempt_enable(...)
+#define trace_preempt_disable(...)
+#define trace_irq_enable_rcuidle(...)
+#define trace_irq_disable_rcuidle(...)
+#define trace_preempt_enable_rcuidle(...)
+#define trace_preempt_disable_rcuidle(...)
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..b8395a020821 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER
 	  address on the current task structure into a stack of calls.
 
 
+config PREEMPTIRQ_EVENTS
+	bool "Enable trace events for preempt and irq disable/enable"
+	select TRACE_IRQFLAGS
+	depends on DEBUG_PREEMPT || !PROVE_LOCKING
+	default n
+	help
+	  Enable tracing of disable and enable events for preemption and irqs.
+	  For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+	  enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+	  be disabled.
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 90f2701d92a7..9f62eee61f14 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
 obj-$(CONFIG_TRACING_MAP) += tracing_map.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 0e3033c00474..03ecb4465ee4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,9 @@
 
 #include "trace.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
 #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
 static struct trace_array		*irqsoff_trace __read_mostly;
 static int				tracer_enabled __read_mostly;
@@ -777,26 +780,53 @@ static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
 #endif
 
 #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
 void trace_hardirqs_on(void)
 {
+	if (!this_cpu_read(tracing_irq_cpu))
+		return;
+
+	trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
 	tracer_hardirqs_on();
+
+	this_cpu_write(tracing_irq_cpu, 0);
 }
 EXPORT_SYMBOL(trace_hardirqs_on);
 
 void trace_hardirqs_off(void)
 {
+	if (this_cpu_read(tracing_irq_cpu))
+		return;
+
+	this_cpu_write(tracing_irq_cpu, 1);
+
+	trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
 	tracer_hardirqs_off();
 }
 EXPORT_SYMBOL(trace_hardirqs_off);
 
 __visible void trace_hardirqs_on_caller(unsigned long caller_addr)
 {
+	if (!this_cpu_read(tracing_irq_cpu))
+		return;
+
+	trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
 	tracer_hardirqs_on_caller(caller_addr);
+
+	this_cpu_write(tracing_irq_cpu, 0);
 }
 EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
 __visible void trace_hardirqs_off_caller(unsigned long caller_addr)
 {
+	if (this_cpu_read(tracing_irq_cpu))
+		return;
+
+	this_cpu_write(tracing_irq_cpu, 1);
+
+	trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
 	tracer_hardirqs_off_caller(caller_addr);
 }
 EXPORT_SYMBOL(trace_hardirqs_off_caller);
@@ -818,14 +848,17 @@ inline void print_irqtrace_events(struct task_struct *curr)
 }
 #endif
 
-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+	(defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
 void trace_preempt_on(unsigned long a0, unsigned long a1)
 {
+	trace_preempt_enable_rcuidle(a0, a1);
 	tracer_preempt_on(a0, a1);
 }
 
 void trace_preempt_off(unsigned long a0, unsigned long a1)
 {
+	trace_preempt_disable_rcuidle(a0, a1);
 	tracer_preempt_off(a0, a1);
 }
 #endif
-- 
cgit v1.3-14-g43fede


From 8715b108cd75523c9b2e833cdcd7aeb363767f95 Mon Sep 17 00:00:00 2001
From: Joel Fernandes <joelaf@google.com>
Date: Mon, 9 Oct 2017 12:29:31 -0700
Subject: ftrace: Clear hashes of stale ips of init memory

Filters should be cleared of init functions during freeing of init
memory when the ftrace dyn records are released. However in current
code, the filters are left as is. This patch clears the hashes of the
saved init functions when the init memory is freed. This fixes the
following issue reproducible with the following sequence of commands for
a test module:
================================================

void bar(void)
{
    printk(KERN_INFO "bar!\n");
}

void foo(void)
{
    printk(KERN_INFO "foo!\n");
    bar();
}

static int __init hello_init(void)
{
    printk(KERN_INFO "Hello world!\n");
    foo();
    return 0;
}

static void __exit hello_cleanup(void)
{
    printk(KERN_INFO "Cleaning up module.\n");
}

module_init(hello_init);
module_exit(hello_cleanup);
================================================

Commands:
echo '*:mod:test' > /d/tracing/set_ftrace_filter
echo function > /d/tracing/current_tracer
modprobe test
rmmod test
sleep 1
modprobe test
cat /d/tracing/set_ftrace_filter

Behavior without patch: Init function is still in the filter
Expected behavior: Shouldn't have any of the filters set

Link: http://lkml.kernel.org/r/20171009192931.56401-1-joelaf@google.com

Signed-off-by: Joel Fernandes <joelaf@google.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e99bd55732e..e0a98225666b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -6067,6 +6067,63 @@ allocate_ftrace_mod_map(struct module *mod,
 }
 #endif /* CONFIG_MODULES */
 
+struct ftrace_init_func {
+	struct list_head list;
+	unsigned long ip;
+};
+
+/* Clear any init ips from hashes */
+static void
+clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)
+{
+	struct ftrace_func_entry *entry;
+
+	if (ftrace_hash_empty(hash))
+		return;
+
+	entry = __ftrace_lookup_ip(hash, func->ip);
+
+	/*
+	 * Do not allow this rec to match again.
+	 * Yeah, it may waste some memory, but will be removed
+	 * if/when the hash is modified again.
+	 */
+	if (entry)
+		entry->ip = 0;
+}
+
+static void
+clear_func_from_hashes(struct ftrace_init_func *func)
+{
+	struct trace_array *tr;
+
+	mutex_lock(&trace_types_lock);
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		if (!tr->ops || !tr->ops->func_hash)
+			continue;
+		mutex_lock(&tr->ops->func_hash->regex_lock);
+		clear_func_from_hash(func, tr->ops->func_hash->filter_hash);
+		clear_func_from_hash(func, tr->ops->func_hash->notrace_hash);
+		mutex_unlock(&tr->ops->func_hash->regex_lock);
+	}
+	mutex_unlock(&trace_types_lock);
+}
+
+static void add_to_clear_hash_list(struct list_head *clear_list,
+				   struct dyn_ftrace *rec)
+{
+	struct ftrace_init_func *func;
+
+	func = kmalloc(sizeof(*func), GFP_KERNEL);
+	if (!func) {
+		WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n");
+		return;
+	}
+
+	func->ip = rec->ip;
+	list_add(&func->list, clear_list);
+}
+
 void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 {
 	unsigned long start = (unsigned long)(start_ptr);
@@ -6076,8 +6133,12 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 	struct dyn_ftrace *rec;
 	struct dyn_ftrace key;
 	struct ftrace_mod_map *mod_map = NULL;
+	struct ftrace_init_func *func, *func_next;
+	struct list_head clear_hash;
 	int order;
 
+	INIT_LIST_HEAD(&clear_hash);
+
 	key.ip = start;
 	key.flags = end;	/* overload flags, as it is unsigned long */
 
@@ -6102,6 +6163,9 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 		if (!rec)
 			continue;
 
+		/* rec will be cleared from hashes after ftrace_lock unlock */
+		add_to_clear_hash_list(&clear_hash, rec);
+
 		if (mod_map)
 			save_ftrace_mod_rec(mod_map, rec);
 
@@ -6123,6 +6187,11 @@ void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 		goto again;
 	}
 	mutex_unlock(&ftrace_lock);
+
+	list_for_each_entry_safe(func, func_next, &clear_hash, list) {
+		clear_func_from_hashes(func);
+		kfree(func);
+	}
 }
 
 void __init ftrace_free_init_mem(void)
-- 
cgit v1.3-14-g43fede


From 952925dec0f276b407b2abce4aee82cba7c700c3 Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 11 Oct 2017 11:56:23 +0100
Subject: bpf: remove redundant variable old_flags

Variable old_flags is being assigned but is never read; it is redundant
and can be removed.

Cleans up clang warning: Value stored to 'old_flags' is never read

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/cgroup.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index e88abc0865d5..3db5a17fcfe8 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -192,7 +192,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 	struct cgroup_subsys_state *css;
 	struct bpf_prog_list *pl;
 	bool pl_was_allocated;
-	u32 old_flags;
 	int err;
 
 	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
@@ -239,7 +238,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
 		pl->prog = prog;
 	}
 
-	old_flags = cgrp->bpf.flags[type];
 	cgrp->bpf.flags[type] = flags;
 
 	/* allocate and recompute effective prog arrays */
-- 
cgit v1.3-14-g43fede


From c5c1ea75a352c3864c4891f36155287a2ed73928 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Tue, 13 Jun 2017 13:06:59 +0200
Subject: tracing: Kconfig text fixes for CONFIG_HWLAT_TRACER

Trivial spelling fixes for Kconfig help text of config HWLAT_TRACER.

Fixes: e7c15cd8a113 ("tracing: Added hardware latency tracer")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/trace/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..f54b7b6b4a4b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -224,7 +224,7 @@ config HWLAT_TRACER
 	select GENERIC_TRACER
 	help
 	 This tracer, when enabled will create one or more kernel threads,
-	 depening on what the cpumask file is set to, which each thread
+	 depending on what the cpumask file is set to, which each thread
 	 spinning in a loop looking for interruptions caused by
 	 something other than the kernel. For example, if a
 	 System Management Interrupt (SMI) takes a noticeable amount of
@@ -239,7 +239,7 @@ config HWLAT_TRACER
 				     iteration
 
 	 A kernel thread is created that will spin with interrupts disabled
-	 for "width" microseconds in every "widow" cycle. It will not spin
+	 for "width" microseconds in every "window" cycle. It will not spin
 	 for "window - width" microseconds, where the system can
 	 continue to operate.
 
-- 
cgit v1.3-14-g43fede


From af41acf8347dd6d11a2a29a11e2866ca4892d600 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Wed, 11 Oct 2017 12:46:47 -0400
Subject: printk: Remove superfluous memory barriers from printk_safe

The variable printk_safe_irq_ready is set and never cleared at system
boot up, when there's only one CPU active. It is set before other
CPUs come on line. Also, it is extremely unlikely that an NMI would
trigger this early in boot up (which I wonder why we even have this
variable at all).

Also mark the printk_safe_irq_ready as read mostly, as it is set at
system boot up, and never touched again.

Link: http://lkml.kernel.org/r/20171011124647.7781f98f@gandalf.local.home

Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/printk/printk_safe.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..724d9292d4b9 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -39,7 +39,7 @@
  * There are situations when we want to make sure that all buffers
  * were handled or when IRQs are blocked.
  */
-static int printk_safe_irq_ready;
+static int printk_safe_irq_ready __read_mostly;
 
 #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) -	\
 				sizeof(atomic_t) -			\
@@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
 /* Get flushed in a more safe context. */
 static void queue_flush_work(struct printk_safe_seq_buf *s)
 {
-	if (printk_safe_irq_ready) {
-		/* Make sure that IRQ work is really initialized. */
-		smp_rmb();
+	if (printk_safe_irq_ready)
 		irq_work_queue(&s->work);
-	}
 }
 
 /*
@@ -398,8 +395,12 @@ void __init printk_safe_init(void)
 #endif
 	}
 
-	/* Make sure that IRQ works are initialized before enabling. */
-	smp_wmb();
+	/*
+	 * In the highly unlikely event that a NMI were to trigger at
+	 * this moment. Make sure IRQ work is set up before this
+	 * variable is set.
+	 */
+	barrier();
 	printk_safe_irq_ready = 1;
 
 	/* Flush pending messages that did not have scheduled IRQ works. */
-- 
cgit v1.3-14-g43fede


From c3b5b6ed1eb4f429addfd9e8e8eb39d1a38c85d0 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 13 Oct 2017 16:22:20 +0200
Subject: tracing: mark trace_test_buffer as __maybe_unused

After trace_selftest_startup_sched_switch is removed, trace_test_buffer()
is only used sometimes, leading to this warning:

kernel/trace/trace_selftest.c:62:12: error: 'trace_test_buffer' defined but not used [-Werror=unused-function]

There is no simple #ifdef condition that captures well whether the
function is in fact used or not, so marking it as __maybe_unused is
probably the best way to shut up the warning. The function will then
be silently dropped when there is no user.

Link: http://lkml.kernel.org/r/20171013142227.1273469-1-arnd@arndb.de

Fixes: d8c4deee6dc6 ("tracing: Remove obsolete sched_switch tracer selftest")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 364f78abdf47..eb9ba5c1ba40 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -59,7 +59,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
  * Test the trace buffer to see if all the elements
  * are still sane.
  */
-static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
+static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
 {
 	unsigned long flags, cnt = 0;
 	int cpu, ret = 0;
-- 
cgit v1.3-14-g43fede


From 1bdec44955edc22fb840f5965987d2972307dcc9 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 12 Oct 2017 10:34:07 -0700
Subject: bpf: verifier: set reg_type on context accesses in second pass

Use a simplified is_valid_access() callback when verifier
is used for program analysis by non-host JITs.  This allows
us to teach the verifier about packet start and packet end
offsets for direct packet access.

We can extend the callback as needed but for most packet
processing needs there isn't much more the offloads may
require.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 43 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cdbcc4f8f6b..9755279d94cb 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -813,6 +813,36 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 	return err;
 }
 
+static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off,
+				     struct bpf_insn_access_aux *info)
+{
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_XDP:
+		switch (off) {
+		case offsetof(struct xdp_buff, data):
+			info->reg_type = PTR_TO_PACKET;
+			return true;
+		case offsetof(struct xdp_buff, data_end):
+			info->reg_type = PTR_TO_PACKET_END;
+			return true;
+		}
+		return false;
+	case BPF_PROG_TYPE_SCHED_CLS:
+		switch (off) {
+		case offsetof(struct sk_buff, data):
+			info->reg_type = PTR_TO_PACKET;
+			return true;
+		case offsetof(struct sk_buff, cb) +
+		     offsetof(struct bpf_skb_data_end, data_end):
+			info->reg_type = PTR_TO_PACKET_END;
+			return true;
+		}
+		return false;
+	default:
+		return false;
+	}
+}
+
 /* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
@@ -821,12 +851,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		.reg_type = *reg_type,
 	};
 
-	/* for analyzer ctx accesses are already validated and converted */
-	if (env->analyzer_ops)
-		return 0;
-
-	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+	if (env->analyzer_ops) {
+		if (analyzer_is_valid_access(env, off, &info)) {
+			*reg_type = info.reg_type;
+			return 0;
+		}
+	} else if (env->prog->aux->ops->is_valid_access &&
+		   env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
-- 
cgit v1.3-14-g43fede


From 9185a610f8f7f1b4e4d28c9de27d1969cf58e0f1 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
Date: Thu, 12 Oct 2017 18:40:02 -0400
Subject: tracing: bpf: Hide bpf trace events when they are not used

All the trace events defined in include/trace/events/bpf.h are only
used when CONFIG_BPF_SYSCALL is defined. But this file gets included by
include/linux/bpf_trace.h which is included by the networking code with
CREATE_TRACE_POINTS defined.

If a trace event is created but not used it still has data structures
and functions created for its use, even though nothing is using them.
To not waste space, do not define the BPF trace events in bpf.h unless
CONFIG_BPF_SYSCALL is defined.

Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/bpf.h | 5 ++++-
 kernel/bpf/core.c          | 3 +++
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h
index 52c8425d144b..1fb58faa4a44 100644
--- a/include/trace/events/bpf.h
+++ b/include/trace/events/bpf.h
@@ -4,6 +4,9 @@
 #if !defined(_TRACE_BPF_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_BPF_H
 
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
+
 #include <linux/filter.h>
 #include <linux/bpf.h>
 #include <linux/fs.h>
@@ -345,7 +348,7 @@ TRACE_EVENT(bpf_map_next_key,
 		  __print_hex(__get_dynamic_array(nxt), __entry->key_len),
 		  __entry->key_trunc ? " ..." : "")
 );
-
+#endif /* CONFIG_BPF_SYSCALL */
 #endif /* _TRACE_BPF_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 248961af2421..8e7c8bf2b687 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1580,5 +1580,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
 
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
 EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
+#endif
-- 
cgit v1.3-14-g43fede


From 8fd0fbbe8888f295eb34172a7e47bf7d3a0a4687 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Oct 2017 09:45:29 +0200
Subject: perf/ftrace: Revert ("perf/ftrace: Fix double traces of perf on
 ftrace:function")

Revert commit:

  75e8387685f6 ("perf/ftrace: Fix double traces of perf on ftrace:function")

The reason I instantly stumbled on that patch is that it only addresses the
ftrace situation and doesn't mention the other _5_ places that use this
interface. It doesn't explain why those don't have the problem and if not, why
their solution doesn't work for ftrace.

It doesn't, but this is just putting more duct tape on.

Link: http://lkml.kernel.org/r/20171011080224.200565770@infradead.org

Cc: Zhou Chengming <zhouchengming1@huawei.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/perf_event.h      |  2 +-
 include/linux/trace_events.h    |  4 ++--
 kernel/events/core.c            | 13 ++++---------
 kernel/trace/trace_event_perf.c |  4 +---
 kernel/trace/trace_kprobe.c     |  4 ++--
 kernel/trace/trace_syscalls.c   |  4 ++--
 kernel/trace/trace_uprobe.c     |  2 +-
 7 files changed, 13 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e22f24ded6a..569d1b54e201 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1184,7 +1184,7 @@ extern void perf_event_init(void);
 extern void perf_tp_event(u16 event_type, u64 count, void *record,
 			  int entry_size, struct pt_regs *regs,
 			  struct hlist_head *head, int rctx,
-			  struct task_struct *task, struct perf_event *event);
+			  struct task_struct *task);
 extern void perf_bp_event(struct perf_event *event, void *data);
 
 #ifndef perf_misc_flags
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2e0f22298fe9..a6349b76fd39 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -507,9 +507,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 static inline void
 perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 		       u64 count, struct pt_regs *regs, void *head,
-		       struct task_struct *task, struct perf_event *event)
+		       struct task_struct *task)
 {
-	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event);
+	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task);
 }
 #endif
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6bc21e202ae4..b8db80c5513b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7954,15 +7954,16 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 		}
 	}
 	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
-		      rctx, task, NULL);
+		      rctx, task);
 }
 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
 
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
-		   struct task_struct *task, struct perf_event *event)
+		   struct task_struct *task)
 {
 	struct perf_sample_data data;
+	struct perf_event *event;
 
 	struct perf_raw_record raw = {
 		.frag = {
@@ -7976,15 +7977,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 
 	perf_trace_buf_update(record, event_type);
 
-	/* Use the given event instead of the hlist */
-	if (event) {
+	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
-	} else {
-		hlist_for_each_entry_rcu(event, head, hlist_entry) {
-			if (perf_tp_event_match(event, &data, regs))
-				perf_swevent_event(event, count, &data, regs);
-		}
 	}
 
 	/*
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 13ba2d3f6a91..562fa69df5d3 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -306,7 +306,6 @@ static void
 perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
-	struct perf_event *event;
 	struct ftrace_entry *entry;
 	struct hlist_head *head;
 	struct pt_regs regs;
@@ -330,9 +329,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	event = container_of(ops, struct perf_event, ftrace_ops);
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL, event);
+			      1, &regs, head, NULL);
 
 #undef ENTRY_SIZE
 }
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index af6134f2e597..996902a526d4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 696afe72d3b1..934b0da72679 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -622,7 +622,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 
 	perf_trace_buf_submit(rec, size, rctx,
 			      sys_data->enter_event->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -716,7 +716,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	}
 
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
-			      1, regs, head, NULL, NULL);
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b34965e62fb9..402120ba4594 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	}
 
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
  out:
 	preempt_enable();
 }
-- 
cgit v1.3-14-g43fede


From 466c81c45b650deca213fda3d0ec4761667379a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 10 Oct 2017 17:15:47 +0200
Subject: perf/ftrace: Fix function trace events

The function-trace <-> perf interface is a tad messed up. Where all
the other trace <-> perf interfaces use a single trace hook
registration and use per-cpu RCU based hlist to iterate the events,
function-trace actually needs multiple hook registrations in order to
minimize function entry patching when filters are present.

The end result is that we iterate events both on the trace hook and on
the hlist, which results in reporting events multiple times.

Since function-trace cannot use the regular scheme, fix it the other
way around, use singleton hlists.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/trace_events.h    |  5 +++
 kernel/trace/trace_event_perf.c | 80 +++++++++++++++++++++++++----------------
 2 files changed, 54 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index a6349b76fd39..ca4e67e466a7 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -173,6 +173,11 @@ enum trace_reg {
 	TRACE_REG_PERF_UNREGISTER,
 	TRACE_REG_PERF_OPEN,
 	TRACE_REG_PERF_CLOSE,
+	/*
+	 * These (ADD/DEL) use a 'boolean' return value, where 1 (true) means a
+	 * custom action was taken and the default action is not to be
+	 * performed.
+	 */
 	TRACE_REG_PERF_ADD,
 	TRACE_REG_PERF_DEL,
 #endif
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 562fa69df5d3..e73f9ab15939 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event)
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
-
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
 
 	if (!(flags & PERF_EF_START))
 		p_event->hw.state = PERF_HES_STOPPED;
 
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
+	/*
+	 * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+	 * and we need to take the default action of enqueueing our event on
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+		struct hlist_head __percpu *pcpu_list;
+		struct hlist_head *list;
+
+		pcpu_list = tp_event->perf_events;
+		if (WARN_ON_ONCE(!pcpu_list))
+			return -EINVAL;
+
+		list = this_cpu_ptr(pcpu_list);
+		hlist_add_head_rcu(&p_event->hlist_entry, list);
+	}
 
-	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+	return 0;
 }
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	hlist_del_rcu(&p_event->hlist_entry);
-	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+	/*
+	 * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+	 * and we need to take the default action of dequeueing our event from
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+		hlist_del_rcu(&p_event->hlist_entry);
 }
 
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -307,14 +321,24 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
 	struct ftrace_entry *entry;
-	struct hlist_head *head;
+	struct perf_event *event;
+	struct hlist_head head;
 	struct pt_regs regs;
 	int rctx;
 
-	head = this_cpu_ptr(event_function.perf_events);
-	if (hlist_empty(head))
+	if ((unsigned long)ops->private != smp_processor_id())
 		return;
 
+	event = container_of(ops, struct perf_event, ftrace_ops);
+
+	/*
+	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+	 * the perf code does is hlist_for_each_entry_rcu(), so we can
+	 * get away with simply setting the @head.first pointer in order
+	 * to create a singular list.
+	 */
+	head.first = &event->hlist_entry;
+
 #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
 		    sizeof(u64)) - sizeof(u32))
 
@@ -330,7 +354,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL);
+			      1, &regs, &head, NULL);
 
 #undef ENTRY_SIZE
 }
@@ -339,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
-	ops->func = perf_ftrace_function_call;
+	ops->flags   |= FTRACE_OPS_FL_RCU;
+	ops->func    = perf_ftrace_function_call;
+	ops->private = (void *)(unsigned long)nr_cpu_ids;
+
 	return register_ftrace_function(ops);
 }
 
@@ -352,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event)
 	return ret;
 }
 
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
-	ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
-	ftrace_function_local_disable(&event->ftrace_ops);
-}
-
 int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data)
 {
+	struct perf_event *event = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
 	case TRACE_REG_UNREGISTER:
@@ -377,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call,
 	case TRACE_REG_PERF_CLOSE:
 		return perf_ftrace_function_unregister(data);
 	case TRACE_REG_PERF_ADD:
-		perf_ftrace_function_enable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+		return 1;
 	case TRACE_REG_PERF_DEL:
-		perf_ftrace_function_disable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+		return 1;
 	}
 
 	return -EINVAL;
-- 
cgit v1.3-14-g43fede


From 1dd311e6dcda4020c603bcf9f390a577d439d509 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Oct 2017 09:45:31 +0200
Subject: perf/ftrace: Small cleanup

ops->flags _should_ be 0 at this point, so setting the flag using
bitwise or is a bit daft.

Link: http://lkml.kernel.org/r/20171011080224.315585202@infradead.org

Requested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 kernel/trace/trace_event_perf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e73f9ab15939..55d6dff37daf 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -363,7 +363,7 @@ static int perf_ftrace_function_register(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags   |= FTRACE_OPS_FL_RCU;
+	ops->flags   = FTRACE_OPS_FL_RCU;
 	ops->func    = perf_ftrace_function_call;
 	ops->private = (void *)(unsigned long)nr_cpu_ids;
 
-- 
cgit v1.3-14-g43fede


From b3a88803ac5b4bda26017b485c8722a8487fefb7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Oct 2017 09:45:32 +0200
Subject: ftrace: Kill FTRACE_OPS_FL_PER_CPU

The one and only user of FTRACE_OPS_FL_PER_CPU is gone, remove the
lot.

Link: http://lkml.kernel.org/r/20171011080224.372422809@infradead.org

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
---
 include/linux/ftrace.h | 83 +++++++++-----------------------------------------
 kernel/trace/ftrace.c  | 55 ++++-----------------------------
 2 files changed, 20 insertions(+), 118 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1f8545caa691..252e334e7b5f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -102,10 +102,6 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
  * ENABLED - set/unset when ftrace_ops is registered/unregistered
  * DYNAMIC - set when ftrace_ops is registered to denote dynamically
  *           allocated ftrace_ops which need special care
- * PER_CPU - set manualy by ftrace_ops user to denote the ftrace_ops
- *           could be controlled by following calls:
- *             ftrace_function_local_enable
- *             ftrace_function_local_disable
  * SAVE_REGS - The ftrace_ops wants regs saved at each function called
  *            and passed to the callback. If this flag is set, but the
  *            architecture does not support passing regs
@@ -149,21 +145,20 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops);
 enum {
 	FTRACE_OPS_FL_ENABLED			= 1 << 0,
 	FTRACE_OPS_FL_DYNAMIC			= 1 << 1,
-	FTRACE_OPS_FL_PER_CPU			= 1 << 2,
-	FTRACE_OPS_FL_SAVE_REGS			= 1 << 3,
-	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 4,
-	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 5,
-	FTRACE_OPS_FL_STUB			= 1 << 6,
-	FTRACE_OPS_FL_INITIALIZED		= 1 << 7,
-	FTRACE_OPS_FL_DELETED			= 1 << 8,
-	FTRACE_OPS_FL_ADDING			= 1 << 9,
-	FTRACE_OPS_FL_REMOVING			= 1 << 10,
-	FTRACE_OPS_FL_MODIFYING			= 1 << 11,
-	FTRACE_OPS_FL_ALLOC_TRAMP		= 1 << 12,
-	FTRACE_OPS_FL_IPMODIFY			= 1 << 13,
-	FTRACE_OPS_FL_PID			= 1 << 14,
-	FTRACE_OPS_FL_RCU			= 1 << 15,
-	FTRACE_OPS_FL_TRACE_ARRAY		= 1 << 16,
+	FTRACE_OPS_FL_SAVE_REGS			= 1 << 2,
+	FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED	= 1 << 3,
+	FTRACE_OPS_FL_RECURSION_SAFE		= 1 << 4,
+	FTRACE_OPS_FL_STUB			= 1 << 5,
+	FTRACE_OPS_FL_INITIALIZED		= 1 << 6,
+	FTRACE_OPS_FL_DELETED			= 1 << 7,
+	FTRACE_OPS_FL_ADDING			= 1 << 8,
+	FTRACE_OPS_FL_REMOVING			= 1 << 9,
+	FTRACE_OPS_FL_MODIFYING			= 1 << 10,
+	FTRACE_OPS_FL_ALLOC_TRAMP		= 1 << 11,
+	FTRACE_OPS_FL_IPMODIFY			= 1 << 12,
+	FTRACE_OPS_FL_PID			= 1 << 13,
+	FTRACE_OPS_FL_RCU			= 1 << 14,
+	FTRACE_OPS_FL_TRACE_ARRAY		= 1 << 15,
 };
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -198,7 +193,6 @@ struct ftrace_ops {
 	unsigned long			flags;
 	void				*private;
 	ftrace_func_t			saved_func;
-	int __percpu			*disabled;
 #ifdef CONFIG_DYNAMIC_FTRACE
 	struct ftrace_ops_hash		local_hash;
 	struct ftrace_ops_hash		*func_hash;
@@ -230,55 +224,6 @@ int register_ftrace_function(struct ftrace_ops *ops);
 int unregister_ftrace_function(struct ftrace_ops *ops);
 void clear_ftrace_function(void);
 
-/**
- * ftrace_function_local_enable - enable ftrace_ops on current cpu
- *
- * This function enables tracing on current cpu by decreasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_enable(struct ftrace_ops *ops)
-{
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return;
-
-	(*this_cpu_ptr(ops->disabled))--;
-}
-
-/**
- * ftrace_function_local_disable - disable ftrace_ops on current cpu
- *
- * This function disables tracing on current cpu by increasing
- * the per cpu control variable.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline void ftrace_function_local_disable(struct ftrace_ops *ops)
-{
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return;
-
-	(*this_cpu_ptr(ops->disabled))++;
-}
-
-/**
- * ftrace_function_local_disabled - returns ftrace_ops disabled value
- *                                  on current cpu
- *
- * This function returns value of ftrace_ops::disabled on current cpu.
- * It must be called with preemption disabled and only on ftrace_ops
- * registered with FTRACE_OPS_FL_PER_CPU. If called without preemption
- * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled.
- */
-static inline int ftrace_function_local_disabled(struct ftrace_ops *ops)
-{
-	WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU));
-	return *this_cpu_ptr(ops->disabled);
-}
-
 extern void ftrace_stub(unsigned long a0, unsigned long a1,
 			struct ftrace_ops *op, struct pt_regs *regs);
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e0a98225666b..2fd3edaec6de 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
 	ftrace_trace_function = ftrace_stub;
 }
 
-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
-	int __percpu *disabled;
-
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return -EINVAL;
-
-	disabled = alloc_percpu(int);
-	if (!disabled)
-		return -ENOMEM;
-
-	ops->disabled = disabled;
-	per_cpu_ops_disable_all(ops);
-	return 0;
-}
-
 static void ftrace_sync(struct work_struct *work)
 {
 	/*
@@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
 	 * If this is a dynamic, RCU, or per CPU ops, or we force list func,
 	 * then it needs to call the list anyway.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
-			  FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
+	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
+	    FTRACE_FORCE_LIST_FUNC)
 		return ftrace_ops_list_func;
 
 	return ftrace_ops_get_func(ops);
@@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	if (!core_kernel_data((unsigned long)ops))
 		ops->flags |= FTRACE_OPS_FL_DYNAMIC;
 
-	if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
-		if (per_cpu_ops_alloc(ops))
-			return -ENOMEM;
-	}
-
 	add_ftrace_ops(&ftrace_ops_list, ops);
 
 	/* Always save the function, and reset at unregistering */
@@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 }
 
-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
-	free_percpu(ops->disabled);
-}
-
 static void ftrace_startup_enable(int command)
 {
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 		 * not currently active, we can just free them
 		 * without synchronizing all CPUs.
 		 */
-		if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+		if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
 			goto free_ops;
 
 		return 0;
@@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 	 * The same goes for freeing the per_cpu data of the per_cpu
 	 * ops.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+	if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
 		/*
 		 * We need to do a hard force of sched synchronization.
 		 * This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 
  free_ops:
 		arch_ftrace_trampoline_free(ops);
-
-		if (ops->flags & FTRACE_OPS_FL_PER_CPU)
-			per_cpu_ops_free(ops);
 	}
 
 	return 0;
@@ -6355,10 +6318,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
 		 * If any of the above fails then the op->func() is not executed.
 		 */
 		if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
-		    (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-		     !ftrace_function_local_disabled(op)) &&
 		    ftrace_ops_test(op, ip, regs)) {
-		    
 			if (FTRACE_WARN_ON(!op->func)) {
 				pr_warn("op=%p %pS\n", op, op);
 				goto out;
@@ -6416,10 +6376,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
 
 	preempt_disable_notrace();
 
-	if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-	    !ftrace_function_local_disabled(op)) {
-		op->func(ip, parent_ip, op, regs);
-	}
+	op->func(ip, parent_ip, op, regs);
 
 	preempt_enable_notrace();
 	trace_clear_recursion(bit);
@@ -6443,7 +6400,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 	 * or does per cpu logic, then we need to call the assist handler.
 	 */
 	if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
-	    ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+	    ops->flags & FTRACE_OPS_FL_RCU)
 		return ftrace_ops_assist_func;
 
 	return ops->func;
-- 
cgit v1.3-14-g43fede


From fe460423438b62eb7440d994ab19a9f444e6280d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 13 Oct 2017 20:29:38 +0200
Subject: posix-stubs: Use get_timespec64() and put_timespec64()

This is a follow-up to commit 5c4994102fb5 ("posix-timers: Use
get_timespec64() and put_timespec64()"), which left two system call using
copy_from_user()/copy_to_user().

Change them as well for consistency.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: y2038@lists.linaro.org
Cc: John Stultz <john.stultz@linaro.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Deepa Dinamani <deepa.kernel@gmail.com>
Link: https://lkml.kernel.org/r/20171013183009.3442318-1-arnd@arndb.de
---
 kernel/time/posix-stubs.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 06f34feb635e..b258bee13b02 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -117,8 +117,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		const struct timespec __user *, rqtp,
 		struct timespec __user *, rmtp)
 {
-	struct timespec64 t64;
-	struct timespec t;
+	struct timespec64 t;
 
 	switch (which_clock) {
 	case CLOCK_REALTIME:
@@ -129,16 +128,15 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 		return -EINVAL;
 	}
 
-	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+	if (get_timespec64(&t, rqtp))
 		return -EFAULT;
-	t64 = timespec_to_timespec64(t);
-	if (!timespec64_valid(&t64))
+	if (!timespec64_valid(&t))
 		return -EINVAL;
 	if (flags & TIMER_ABSTIME)
 		rmtp = NULL;
 	current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
 	current->restart_block.nanosleep.rmtp = rmtp;
-	return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
+	return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
 				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
 				 which_clock);
 }
@@ -203,8 +201,7 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		       struct compat_timespec __user *, rqtp,
 		       struct compat_timespec __user *, rmtp)
 {
-	struct timespec64 t64;
-	struct timespec t;
+	struct timespec64 t;
 
 	switch (which_clock) {
 	case CLOCK_REALTIME:
@@ -215,16 +212,15 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
 		return -EINVAL;
 	}
 
-	if (compat_get_timespec(&t, rqtp))
+	if (compat_get_timespec64(&t, rqtp))
 		return -EFAULT;
-	t64 = timespec_to_timespec64(t);
-	if (!timespec64_valid(&t64))
+	if (!timespec64_valid(&t))
 		return -EINVAL;
 	if (flags & TIMER_ABSTIME)
 		rmtp = NULL;
 	current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
 	current->restart_block.nanosleep.compat_rmtp = rmtp;
-	return hrtimer_nanosleep(&t64, flags & TIMER_ABSTIME ?
+	return hrtimer_nanosleep(&t, flags & TIMER_ABSTIME ?
 				 HRTIMER_MODE_ABS : HRTIMER_MODE_REL,
 				 which_clock);
 }
-- 
cgit v1.3-14-g43fede


From 4eb1bca1793385b8caff4b2e1f19b31a013dd1ec Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 13 Oct 2017 20:34:35 +0200
Subject: time: Use do_settimeofday64() internally

do_settimeofday() is a wrapper around do_settimeofday64(), so that function
can be called directly. The wrapper can be removed once the last user is
gone.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: y2038@lists.linaro.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Deepa Dinamani <deepa.kernel@gmail.com>
Link: https://lkml.kernel.org/r/20171013183452.3635956-1-arnd@arndb.de
---
 kernel/time/time.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/time.c b/kernel/time/time.c
index 44a8c1402133..cfe3d3e4679f 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE1(time, time_t __user *, tloc)
 
 SYSCALL_DEFINE1(stime, time_t __user *, tptr)
 {
-	struct timespec tv;
+	struct timespec64 tv;
 	int err;
 
 	if (get_user(tv.tv_sec, tptr))
@@ -90,11 +90,11 @@ SYSCALL_DEFINE1(stime, time_t __user *, tptr)
 
 	tv.tv_nsec = 0;
 
-	err = security_settime(&tv, NULL);
+	err = security_settime64(&tv, NULL);
 	if (err)
 		return err;
 
-	do_settimeofday(&tv);
+	do_settimeofday64(&tv);
 	return 0;
 }
 
@@ -122,7 +122,7 @@ COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc)
 
 COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
 {
-	struct timespec tv;
+	struct timespec64 tv;
 	int err;
 
 	if (get_user(tv.tv_sec, tptr))
@@ -130,11 +130,11 @@ COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr)
 
 	tv.tv_nsec = 0;
 
-	err = security_settime(&tv, NULL);
+	err = security_settime64(&tv, NULL);
 	if (err)
 		return err;
 
-	do_settimeofday(&tv);
+	do_settimeofday64(&tv);
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 6710e1126934d8b4372b4d2f9ae1646cd3f151bf Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:28 +0200
Subject: bpf: introduce new bpf cpu map type BPF_MAP_TYPE_CPUMAP

The 'cpumap' is primarily used as a backend map for XDP BPF helper
call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.

This patch implement the main part of the map.  It is not connected to
the XDP redirect system yet, and no SKB allocation are done yet.

The main concern in this patch is to ensure the datapath can run
without any locking.  This adds complexity to the setup and tear-down
procedure, which assumptions are extra carefully documented in the
code comments.

V2:
 - make sure array isn't larger than NR_CPUS
 - make sure CPUs added is a valid possible CPU

V3: fix nitpicks from Jakub Kicinski <kubakici@wp.pl>

V5:
 - Restrict map allocation to root / CAP_SYS_ADMIN
 - WARN_ON_ONCE if queue is not empty on tear-down
 - Return -EPERM on memlock limit instead of -ENOMEM
 - Error code in __cpu_map_entry_alloc() also handle ptr_ring_cleanup()
 - Moved cpu_map_enqueue() to next patch

V6: all notice by Daniel Borkmann
 - Fix err return code in cpu_map_alloc() introduced in V5
 - Move cpu_possible() check after max_entries boundary check
 - Forbid usage initially in check_map_func_compatibility()

V7:
 - Fix alloc error path spotted by Daniel Borkmann
 - Did stress test adding+removing CPUs from the map concurrently
 - Fixed refcnt issue on cpu_map_entry, kthread started too soon
 - Make sure packets are flushed during tear-down, involved use of
   rcu_barrier() and kthread_run only exit after queue is empty
 - Fix alloc error path in __cpu_map_entry_alloc() for ptr_ring

V8:
 - Nitpicking comments and gramma by Edward Cree
 - Fix missing semi-colon introduced in V7 due to rebasing
 - Move struct bpf_cpu_map_entry members cpu+map_id to tracepoint patch

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_types.h      |   1 +
 include/uapi/linux/bpf.h       |   1 +
 kernel/bpf/Makefile            |   1 +
 kernel/bpf/cpumap.c            | 560 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |   8 +-
 kernel/bpf/verifier.c          |   5 +
 tools/include/uapi/linux/bpf.h |   1 +
 7 files changed, 576 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/cpumap.c

(limited to 'kernel')

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 6f1a567667b8..814c1081a4a9 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -41,4 +41,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
 #ifdef CONFIG_STREAM_PARSER
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops)
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 6db9e1d679cd..4303fb6c3817 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 53fb09f92e3f..e597daae6120 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..e1e25ddba038
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,560 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU.  The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism, that allow
+ * separating the early driver network XDP layer, from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage.  This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+
+/* General idea: XDP packets getting XDP redirected to another CPU,
+ * will maximum be stored/queued for one driver ->poll() call.  It is
+ * guaranteed that setting flush bit and flush operation happen on
+ * same CPU.  Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+	void *q[CPU_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+	u32 qsize;  /* Queue size placeholder for map lookup */
+
+	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+	struct xdp_bulk_queue __percpu *bulkq;
+
+	/* Queue with potential multi-producers, and single-consumer kthread */
+	struct ptr_ring *queue;
+	struct task_struct *kthread;
+	struct work_struct kthread_stop_wq;
+
+	atomic_t refcnt; /* Control when this struct can be free'ed */
+	struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+	struct bpf_map map;
+	/* Below members specific for map type */
+	struct bpf_cpu_map_entry **cpu_map;
+	unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_cpu_map *cmap;
+	int err = -ENOMEM;
+	u64 cost;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+		return ERR_PTR(-EINVAL);
+
+	cmap = kzalloc(sizeof(*cmap), GFP_USER);
+	if (!cmap)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	cmap->map.map_type = attr->map_type;
+	cmap->map.key_size = attr->key_size;
+	cmap->map.value_size = attr->value_size;
+	cmap->map.max_entries = attr->max_entries;
+	cmap->map.map_flags = attr->map_flags;
+	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+	/* Pre-limit array size based on NR_CPUS, not final CPU check */
+	if (cmap->map.max_entries > NR_CPUS) {
+		err = -E2BIG;
+		goto free_cmap;
+	}
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_cmap;
+	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice returns -EPERM on if map size is larger than memlock limit */
+	ret = bpf_map_precharge_memlock(cmap->map.pages);
+	if (ret) {
+		err = ret;
+		goto free_cmap;
+	}
+
+	/* A per cpu bitfield with a bit per possible CPU in map  */
+	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+					    __alignof__(unsigned long));
+	if (!cmap->flush_needed)
+		goto free_cmap;
+
+	/* Alloc array for possible remote "destination" CPUs */
+	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+					   sizeof(struct bpf_cpu_map_entry *),
+					   cmap->map.numa_node);
+	if (!cmap->cpu_map)
+		goto free_percpu;
+
+	return &cmap->map;
+free_percpu:
+	free_percpu(cmap->flush_needed);
+free_cmap:
+	kfree(cmap);
+	return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	if (WARN_ON_ONCE(ptr))
+		page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+	struct bpf_cpu_map_entry *rcpu;
+
+	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+	 * as it waits until all in-flight call_rcu() callbacks complete.
+	 */
+	rcu_barrier();
+
+	/* kthread_stop will wake_up_process and wait for it to complete */
+	kthread_stop(rcpu->kthread);
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+	struct bpf_cpu_map_entry *rcpu = data;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/* When kthread gives stop order, then rcpu have been disconnected
+	 * from map, thus no new packets can enter. Remaining in-flight
+	 * per CPU stored packets are flushed to this queue.  Wait honoring
+	 * kthread_stop signal until queue is empty.
+	 */
+	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		struct xdp_pkt *xdp_pkt;
+
+		schedule();
+		/* Do work */
+		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
+			/* For now just "refcnt-free" */
+			page_frag_free(xdp_pkt);
+		}
+		__set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	put_cpu_map_entry(rcpu);
+	return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+	struct bpf_cpu_map_entry *rcpu;
+	int numa, err;
+
+	/* Have map->numa_node, but choose node of redirect target CPU */
+	numa = cpu_to_node(cpu);
+
+	rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+	if (!rcpu)
+		return NULL;
+
+	/* Alloc percpu bulkq */
+	rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+					 sizeof(void *), gfp);
+	if (!rcpu->bulkq)
+		goto free_rcu;
+
+	/* Alloc queue */
+	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+	if (!rcpu->queue)
+		goto free_bulkq;
+
+	err = ptr_ring_init(rcpu->queue, qsize, gfp);
+	if (err)
+		goto free_queue;
+
+	rcpu->qsize = qsize;
+
+	/* Setup kthread */
+	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+					       "cpumap/%d/map:%d", cpu, map_id);
+	if (IS_ERR(rcpu->kthread))
+		goto free_ptr_ring;
+
+	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+	/* Make sure kthread runs on a single CPU */
+	kthread_bind(rcpu->kthread, cpu);
+	wake_up_process(rcpu->kthread);
+
+	return rcpu;
+
+free_ptr_ring:
+	ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+	kfree(rcpu->queue);
+free_bulkq:
+	free_percpu(rcpu->bulkq);
+free_rcu:
+	kfree(rcpu);
+	return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+	struct bpf_cpu_map_entry *rcpu;
+	int cpu;
+
+	/* This cpu_map_entry have been disconnected from map and one
+	 * RCU graze-period have elapsed.  Thus, XDP cannot queue any
+	 * new packets and cannot change/set flush_needed that can
+	 * find this entry.
+	 */
+	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+	/* Flush remaining packets in percpu bulkq */
+	for_each_online_cpu(cpu) {
+		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+		/* No concurrent bq_enqueue can run at this point */
+		bq_flush_to_queue(rcpu, bq);
+	}
+	free_percpu(rcpu->bulkq);
+	/* Cannot kthread_stop() here, last put free rcpu resources */
+	put_cpu_map_entry(rcpu);
+}
+
+/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet. Because driver side
+ * rcu_read_lock/unlock only protects the running XDP program.  The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq).  A refcnt
+ * makes sure to last user (kthread_stop vs. call_rcu) free memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flush remaining packets in
+ * percpu bulkq to queue.  Due to caller map_delete_elem() disable
+ * preemption, cannot call kthread_stop() to make sure queue is empty.
+ * Instead a work_queue is started for stopping kthread,
+ * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * stopping kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+	struct bpf_cpu_map_entry *old_rcpu;
+
+	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+	if (old_rcpu) {
+		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+		schedule_work(&old_rcpu->kthread_stop_wq);
+	}
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 key_cpu = *(u32 *)key;
+
+	if (key_cpu >= map->max_entries)
+		return -EINVAL;
+
+	/* notice caller map_delete_elem() use preempt_disable() */
+	__cpu_map_entry_replace(cmap, key_cpu, NULL);
+	return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	/* Array index key correspond to CPU number */
+	u32 key_cpu = *(u32 *)key;
+	/* Value is the queue size */
+	u32 qsize = *(u32 *)value;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(key_cpu >= cmap->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+	if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+		return -EOVERFLOW;
+
+	/* Make sure CPU is a valid possible cpu */
+	if (!cpu_possible(key_cpu))
+		return -ENODEV;
+
+	if (qsize == 0) {
+		rcpu = NULL; /* Same as deleting */
+	} else {
+		/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+		if (!rcpu)
+			return -ENOMEM;
+	}
+	rcu_read_lock();
+	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
+	rcu_read_unlock();
+	return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	int cpu;
+	u32 i;
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the bpf programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+	 * It does __not__ ensure pending flush operations (if any) are
+	 * complete.
+	 */
+	synchronize_rcu();
+
+	/* To ensure all pending flush operations have completed wait for flush
+	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+	 * Because the above synchronize_rcu() ensures the map is disconnected
+	 * from the program we can assume no new bits will be set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, cmap->map.max_entries))
+			cond_resched();
+	}
+
+	/* For cpu_map the remote CPUs can still be using the entries
+	 * (struct bpf_cpu_map_entry).
+	 */
+	for (i = 0; i < cmap->map.max_entries; i++) {
+		struct bpf_cpu_map_entry *rcpu;
+
+		rcpu = READ_ONCE(cmap->cpu_map[i]);
+		if (!rcpu)
+			continue;
+
+		/* bq flush and cleanup happens after RCU graze-period */
+		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+	}
+	free_percpu(cmap->flush_needed);
+	bpf_map_area_free(cmap->cpu_map);
+	kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	rcpu = READ_ONCE(cmap->cpu_map[key]);
+	return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map_entry *rcpu =
+		__cpu_map_lookup_elem(map, *(u32 *)key);
+
+	return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= cmap->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == cmap->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+	.map_alloc		= cpu_map_alloc,
+	.map_free		= cpu_map_free,
+	.map_delete_elem	= cpu_map_delete_elem,
+	.map_update_elem	= cpu_map_update_elem,
+	.map_lookup_elem	= cpu_map_lookup_elem,
+	.map_get_next_key	= cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq)
+{
+	struct ptr_ring *q;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	q = rcpu->queue;
+	spin_lock(&q->producer_lock);
+
+	for (i = 0; i < bq->count; i++) {
+		void *xdp_pkt = bq->q[i];
+		int err;
+
+		err = __ptr_ring_produce(q, xdp_pkt);
+		if (err) {
+			/* Free xdp_pkt */
+			page_frag_free(xdp_pkt);
+		}
+	}
+	bq->count = 0;
+	spin_unlock(&q->producer_lock);
+
+	return 0;
+}
+
+/* Notice: Will change in later patch */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+};
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+		bq_flush_to_queue(rcpu, bq);
+
+	/* Notice, xdp_buff/page MUST be queued here, long enough for
+	 * driver to code invoking us to finished, due to driver
+	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
+	 *
+	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * with another CPU on page-refcnt and remaining driver code).
+	 * Queue time is very short, as driver will invoke flush
+	 * operation, when completing napi->poll call.
+	 */
+	bq->q[bq->count++] = xdp_pkt;
+	return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+	__set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+	u32 bit;
+
+	/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+	 * and __cpu_map_flush() happen on same CPU. Thus, the percpu
+	 * bitmap indicate which percpu bulkq have packets.
+	 */
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+		struct xdp_bulk_queue *bq;
+
+		/* This is possible if entry is removed by user space
+		 * between xdp redirect and flush op.
+		 */
+		if (unlikely(!rcpu))
+			continue;
+
+		__clear_bit(bit, bitmap);
+
+		/* Flush all frames in bulkq to real queue */
+		bq = this_cpu_ptr(rcpu->bulkq);
+		bq_flush_to_queue(rcpu, bq);
+
+		/* If already running, costs spin_lock_irqsave + smb_mb */
+		wake_up_process(rcpu->kthread);
+	}
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d124e702e040..54fba06942f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -592,6 +592,12 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
+	/* Need to create a kthread, thus must support schedule */
+	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		err = map->ops->map_update_elem(map, key, value, attr->flags);
+		goto out;
+	}
+
 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
 	 * inside bpf map update or delete otherwise deadlocks are possible
 	 */
@@ -622,7 +628,7 @@ static int map_update_elem(union bpf_attr *attr)
 	}
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9755279d94cb..cefa64be9a2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1444,6 +1444,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	/* Restrict bpf side of cpumap, open when use-cases appear */
+	case BPF_MAP_TYPE_CPUMAP:
+		if (func_id != BPF_FUNC_redirect_map)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index fb4fb81ce5b0..fa93033dc521 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -112,6 +112,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_CPUMAP,
 };
 
 enum bpf_prog_type {
-- 
cgit v1.3-14-g43fede


From 9c270af37bb62e708e3e4415d653ce73e713df02 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:34 +0200
Subject: bpf: XDP_REDIRECT enable use of cpumap

This patch connects cpumap to the xdp_do_redirect_map infrastructure.

Still no SKB allocation are done yet.  The XDP frames are transferred
to the other CPU, but they are simply refcnt decremented on the remote
CPU.  This served as a good benchmark for measuring the overhead of
remote refcnt decrement.  If driver page recycle cache is not
efficient then this, exposes a bottleneck in the page allocator.

A shout-out to MST's ptr_ring, which is the secret behind is being so
efficient to transfer memory pointers between CPUs, without constantly
bouncing cache-lines between CPUs.

V3: Handle !CONFIG_BPF_SYSCALL pointed out by kbuild test robot.

V4: Make Generic-XDP aware of cpumap type, but don't allow redirect yet,
 as implementation require a separate upstream discussion.

V5:
 - Fix a maybe-uninitialized pointed out by kbuild test robot.
 - Restrict bpf-prog side access to cpumap, open when use-cases appear
 - Implement cpu_map_enqueue() as a more simple void pointer enqueue

V6:
 - Allow cpumap type for usage in helper bpf_redirect_map,
   general bpf-prog side restriction moved to earlier patch.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h        |  31 +++++++++-
 include/trace/events/xdp.h |  10 +++-
 kernel/bpf/cpumap.c        |  22 ++++++-
 kernel/bpf/verifier.c      |   3 +-
 net/core/filter.c          | 140 +++++++++++++++++++++++++++++++++++----------
 5 files changed, 172 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4373125de1f3..6d4dd844828a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -355,6 +355,13 @@ struct net_device  *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
+void __cpu_map_flush(struct bpf_map *map);
+struct xdp_buff;
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx);
+
 /* Return map's numa specified by userspace */
 static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 {
@@ -362,7 +369,7 @@ static inline int bpf_map_attr_numa_node(const union bpf_attr *attr)
 		attr->numa_node : NUMA_NO_NODE;
 }
 
-#else
+#else /* !CONFIG_BPF_SYSCALL */
 static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 {
 	return ERR_PTR(-EOPNOTSUPP);
@@ -425,6 +432,28 @@ static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
 static inline void __dev_map_flush(struct bpf_map *map)
 {
 }
+
+static inline
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	return NULL;
+}
+
+static inline void __cpu_map_insert_ctx(struct bpf_map *map, u32 index)
+{
+}
+
+static inline void __cpu_map_flush(struct bpf_map *map)
+{
+}
+
+struct xdp_buff;
+static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_buff *xdp,
+				  struct net_device *dev_rx)
+{
+	return 0;
+}
 #endif /* CONFIG_BPF_SYSCALL */
 
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 4e16c43fba10..eb2ece96c1a2 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -136,12 +136,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 		  __entry->map_id, __entry->map_index)
 );
 
+#define devmap_ifindex(fwd, map)				\
+	(!fwd ? 0 :						\
+	 (!map ? 0 :						\
+	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	   ((struct net_device *)fwd)->ifindex : 0)))
+
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
-	 trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0,	\
+	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
 				0, map, idx)
 
 #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err)	\
-	 trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0,	\
+	 trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map),	\
 				    err, map, idx)
 
 #endif /* _TRACE_XDP_H */
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index e1e25ddba038..768da6a2c265 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -500,7 +500,7 @@ struct xdp_pkt {
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
@@ -520,6 +520,26 @@ int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	return 0;
 }
 
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
+{
+	struct xdp_pkt *xdp_pkt;
+	int headroom;
+
+	/* For now this is just used as a void pointer to data_hard_start.
+	 * Followup patch will generalize this.
+	 */
+	xdp_pkt = xdp->data_hard_start;
+
+	/* Fake writing into xdp_pkt->data to measure overhead */
+	headroom = xdp->data - xdp->data_hard_start;
+	if (headroom < sizeof(*xdp_pkt))
+		xdp_pkt->data = xdp->data;
+
+	bq_enqueue(rcpu, xdp_pkt);
+	return 0;
+}
+
 void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
 {
 	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cefa64be9a2f..e4d5136725a2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1486,7 +1486,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_redirect_map:
-		if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+		    map->map_type != BPF_MAP_TYPE_CPUMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
diff --git a/net/core/filter.c b/net/core/filter.c
index 140fa9f9c0f4..4d88e0665c41 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2526,10 +2526,36 @@ static int __bpf_tx_xdp(struct net_device *dev,
 	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
 	if (err)
 		return err;
-	if (map)
+	dev->netdev_ops->ndo_xdp_flush(dev);
+	return 0;
+}
+
+static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
+			    struct bpf_map *map,
+			    struct xdp_buff *xdp,
+			    u32 index)
+{
+	int err;
+
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		struct net_device *dev = fwd;
+
+		if (!dev->netdev_ops->ndo_xdp_xmit)
+			return -EOPNOTSUPP;
+
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+		if (err)
+			return err;
 		__dev_map_insert_ctx(map, index);
-	else
-		dev->netdev_ops->ndo_xdp_flush(dev);
+
+	} else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		struct bpf_cpu_map_entry *rcpu = fwd;
+
+		err = cpu_map_enqueue(rcpu, xdp, dev_rx);
+		if (err)
+			return err;
+		__cpu_map_insert_ctx(map, index);
+	}
 	return 0;
 }
 
@@ -2539,11 +2565,33 @@ void xdp_do_flush_map(void)
 	struct bpf_map *map = ri->map_to_flush;
 
 	ri->map_to_flush = NULL;
-	if (map)
-		__dev_map_flush(map);
+	if (map) {
+		switch (map->map_type) {
+		case BPF_MAP_TYPE_DEVMAP:
+			__dev_map_flush(map);
+			break;
+		case BPF_MAP_TYPE_CPUMAP:
+			__cpu_map_flush(map);
+			break;
+		default:
+			break;
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(xdp_do_flush_map);
 
+static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
+{
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_DEVMAP:
+		return __dev_map_lookup_elem(map, index);
+	case BPF_MAP_TYPE_CPUMAP:
+		return __cpu_map_lookup_elem(map, index);
+	default:
+		return NULL;
+	}
+}
+
 static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
 				   unsigned long aux)
 {
@@ -2556,8 +2604,8 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
-	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
+	void *fwd = NULL;
 	int err;
 
 	ri->ifindex = 0;
@@ -2570,7 +2618,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 		goto err;
 	}
 
-	fwd = __dev_map_lookup_elem(map, index);
+	fwd = __xdp_map_lookup_elem(map, index);
 	if (!fwd) {
 		err = -EINVAL;
 		goto err;
@@ -2578,7 +2626,7 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 	if (ri->map_to_flush && ri->map_to_flush != map)
 		xdp_do_flush_map();
 
-	err = __bpf_tx_xdp(fwd, map, xdp, index);
+	err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
 	if (unlikely(err))
 		goto err;
 
@@ -2620,54 +2668,88 @@ err:
 }
 EXPORT_SYMBOL_GPL(xdp_do_redirect);
 
-int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
-			    struct bpf_prog *xdp_prog)
+static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd)
+{
+	unsigned int len;
+
+	if (unlikely(!(fwd->flags & IFF_UP)))
+		return -ENETDOWN;
+
+	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
+	if (skb->len > len)
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb,
+				struct bpf_prog *xdp_prog)
 {
 	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
 	unsigned long map_owner = ri->map_owner;
 	struct bpf_map *map = ri->map;
 	struct net_device *fwd = NULL;
 	u32 index = ri->ifindex;
-	unsigned int len;
 	int err = 0;
 
 	ri->ifindex = 0;
 	ri->map = NULL;
 	ri->map_owner = 0;
 
-	if (map) {
-		if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
-			err = -EFAULT;
-			map = NULL;
-			goto err;
-		}
-		fwd = __dev_map_lookup_elem(map, index);
-	} else {
-		fwd = dev_get_by_index_rcu(dev_net(dev), index);
+	if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
+		err = -EFAULT;
+		map = NULL;
+		goto err;
 	}
+	fwd = __xdp_map_lookup_elem(map, index);
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
 		goto err;
 	}
 
-	if (unlikely(!(fwd->flags & IFF_UP))) {
-		err = -ENETDOWN;
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+		if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
+			goto err;
+		skb->dev = fwd;
+	} else {
+		/* TODO: Handle BPF_MAP_TYPE_CPUMAP */
+		err = -EBADRQC;
 		goto err;
 	}
 
-	len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;
-	if (skb->len > len) {
-		err = -EMSGSIZE;
+	_trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+	return 0;
+err:
+	_trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+	return err;
+}
+
+int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
+			    struct bpf_prog *xdp_prog)
+{
+	struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+	u32 index = ri->ifindex;
+	struct net_device *fwd;
+	int err = 0;
+
+	if (ri->map)
+		return xdp_do_generic_redirect_map(dev, skb, xdp_prog);
+
+	ri->ifindex = 0;
+	fwd = dev_get_by_index_rcu(dev_net(dev), index);
+	if (unlikely(!fwd)) {
+		err = -EINVAL;
 		goto err;
 	}
 
+	if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
+		goto err;
+
 	skb->dev = fwd;
-	map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index)
-		: _trace_xdp_redirect(dev, xdp_prog, index);
+	_trace_xdp_redirect(dev, xdp_prog, index);
 	return 0;
 err:
-	map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err)
-		: _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+	_trace_xdp_redirect_err(dev, xdp_prog, index, err);
 	return err;
 }
 EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
-- 
cgit v1.3-14-g43fede


From 1c601d829ab0d7ac3ac44853f83db2206afe67fc Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:39 +0200
Subject: bpf: cpumap xdp_buff to skb conversion and allocation

This patch makes cpumap functional, by adding SKB allocation and
invoking the network stack on the dequeuing CPU.

For constructing the SKB on the remote CPU, the xdp_buff in converted
into a struct xdp_pkt, and it mapped into the top headroom of the
packet, to avoid allocating separate mem.  For now, struct xdp_pkt is
just a cpumap internal data structure, with info carried between
enqueue to dequeue.

If a driver doesn't have enough headroom it is simply dropped, with
return code -EOVERFLOW.  This will be picked up the xdp tracepoint
infrastructure, to allow users to catch this.

V2: take into account xdp->data_meta

V4:
 - Drop busypoll tricks, keeping it more simple.
 - Skip RPS and Generic-XDP-recursive-reinjection, suggested by Alexei

V5: correct RCU read protection around __netif_receive_skb_core.

V6: Setting TASK_RUNNING vs TASK_INTERRUPTIBLE based on talk with Rik van Riel

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |   1 +
 kernel/bpf/cpumap.c       | 152 +++++++++++++++++++++++++++++++++++++++-------
 net/core/dev.c            |  27 ++++++++
 3 files changed, 158 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31bb3010c69b..bf014afcb914 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3260,6 +3260,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
+int netif_receive_skb_core(struct sk_buff *skb);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 768da6a2c265..ee7adf4352dd 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -25,6 +25,9 @@
 #include <linux/kthread.h>
 #include <linux/capability.h>
 
+#include <linux/netdevice.h>   /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
 /* General idea: XDP packets getting XDP redirected to another CPU,
  * will maximum be stored/queued for one driver ->poll() call.  It is
  * guaranteed that setting flush bit and flush operation happen on
@@ -179,6 +182,92 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }
 
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+	struct xdp_pkt *xdp_pkt;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if ((headroom - metasize) < sizeof(*xdp_pkt))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_pkt = xdp->data_hard_start;
+
+	xdp_pkt->data = xdp->data;
+	xdp_pkt->len  = xdp->data_end - xdp->data;
+	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+	xdp_pkt->metasize = metasize;
+
+	return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_pkt *xdp_pkt)
+{
+	unsigned int frame_size;
+	void *pkt_data_start;
+	struct sk_buff *skb;
+
+	/* build_skb need to place skb_shared_info after SKB end, and
+	 * also want to know the memory "truesize".  Thus, need to
+	 * know the memory frame size backing xdp_buff.
+	 *
+	 * XDP was designed to have PAGE_SIZE frames, but this
+	 * assumption is not longer true with ixgbe and i40e.  It
+	 * would be preferred to set frame_size to 2048 or 4096
+	 * depending on the driver.
+	 *   frame_size = 2048;
+	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *
+	 * Instead, with info avail, skb_shared_info in placed after
+	 * packet len.  This, unfortunately fakes the truesize.
+	 * Another disadvantage of this approach, the skb_shared_info
+	 * is not at a fixed memory location, with mixed length
+	 * packets, which is bad for cache-line hotness.
+	 */
+	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	skb = build_skb(pkt_data_start, frame_size);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, xdp_pkt->headroom);
+	__skb_put(skb, xdp_pkt->len);
+	if (xdp_pkt->metasize)
+		skb_metadata_set(skb, xdp_pkt->metasize);
+
+	/* Essential SKB info: protocol and skb->dev */
+	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+	/* Optional SKB info, currently missing:
+	 * - HW checksum info		(skb->ip_summed)
+	 * - HW RX hash			(skb_set_hash)
+	 * - RX ring dev queue index	(skb_record_rx_queue)
+	 */
+
+	return skb;
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -191,15 +280,45 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		unsigned int processed = 0, drops = 0;
 		struct xdp_pkt *xdp_pkt;
 
-		schedule();
-		/* Do work */
-		while ((xdp_pkt = ptr_ring_consume(rcpu->queue))) {
-			/* For now just "refcnt-free" */
-			page_frag_free(xdp_pkt);
+		/* Release CPU reschedule checks */
+		if (__ptr_ring_empty(rcpu->queue)) {
+			__set_current_state(TASK_INTERRUPTIBLE);
+			schedule();
+		} else {
+			cond_resched();
+		}
+		__set_current_state(TASK_RUNNING);
+
+		/* Process packets in rcpu->queue */
+		local_bh_disable();
+		/*
+		 * The bpf_cpu_map_entry is single consumer, with this
+		 * kthread CPU pinned. Lockless access to ptr_ring
+		 * consume side valid as no-resize allowed of queue.
+		 */
+		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+			struct sk_buff *skb;
+			int ret;
+
+			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			if (!skb) {
+				page_frag_free(xdp_pkt);
+				continue;
+			}
+
+			/* Inject into network stack */
+			ret = netif_receive_skb_core(skb);
+			if (ret == NET_RX_DROP)
+				drops++;
+
+			/* Limit BH-disable period */
+			if (++processed == 8)
+				break;
 		}
-		__set_current_state(TASK_INTERRUPTIBLE);
+		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
 	__set_current_state(TASK_RUNNING);
 
@@ -490,13 +609,6 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	return 0;
 }
 
-/* Notice: Will change in later patch */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-};
-
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
@@ -524,17 +636,13 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
 	struct xdp_pkt *xdp_pkt;
-	int headroom;
 
-	/* For now this is just used as a void pointer to data_hard_start.
-	 * Followup patch will generalize this.
-	 */
-	xdp_pkt = xdp->data_hard_start;
+	xdp_pkt = convert_to_xdp_pkt(xdp);
+	if (!xdp_pkt)
+		return -EOVERFLOW;
 
-	/* Fake writing into xdp_pkt->data to measure overhead */
-	headroom = xdp->data - xdp->data_hard_start;
-	if (headroom < sizeof(*xdp_pkt))
-		xdp_pkt->data = xdp->data;
+	/* Info needed when constructing SKB on remote CPU */
+	xdp_pkt->dev_rx = dev_rx;
 
 	bq_enqueue(rcpu, xdp_pkt);
 	return 0;
diff --git a/net/core/dev.c b/net/core/dev.c
index d2b20e73080e..cf5894f0e6eb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4492,6 +4492,33 @@ out:
 	return ret;
 }
 
+/**
+ *	netif_receive_skb_core - special purpose version of netif_receive_skb
+ *	@skb: buffer to process
+ *
+ *	More direct receive version of netif_receive_skb().  It should
+ *	only be used by callers that have a need to skip RPS and Generic XDP.
+ *	Caller must also take care of handling if (page_is_)pfmemalloc.
+ *
+ *	This function may only be called from softirq context and interrupts
+ *	should be enabled.
+ *
+ *	Return values (usually ignored):
+ *	NET_RX_SUCCESS: no congestion
+ *	NET_RX_DROP: packet was dropped
+ */
+int netif_receive_skb_core(struct sk_buff *skb)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = __netif_receive_skb_core(skb, false);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(netif_receive_skb_core);
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	int ret;
-- 
cgit v1.3-14-g43fede


From f9419f7bd7a5318b636a941a0214c5cdfa6f6530 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 16 Oct 2017 12:19:44 +0200
Subject: bpf: cpumap add tracepoints

This adds two tracepoint to the cpumap.  One for the enqueue side
trace_xdp_cpumap_enqueue() and one for the kthread dequeue side
trace_xdp_cpumap_kthread().

To mitigate the tracepoint overhead, these are invoked during the
enqueue/dequeue bulking phases, thus amortizing the cost.

The obvious use-cases are for debugging and monitoring.  The
non-intuitive use-case is using these as a feedback loop to know the
system load.  One can imagine auto-scaling by reducing, adding or
activating more worker CPUs on demand.

V4: tracepoint remove time_limit info, instead add sched info

V8: intro struct bpf_cpu_map_entry members cpu+map_id in this patch

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/trace/events/xdp.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/cpumap.c        | 24 ++++++++++++----
 2 files changed, 89 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index eb2ece96c1a2..0c8dec61987e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -150,6 +150,76 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
 	 trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map),	\
 				    err, map, idx)
 
+TRACE_EVENT(xdp_cpumap_kthread,
+
+	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
+		 int sched),
+
+	TP_ARGS(map_id, processed, drops, sched),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(int, cpu)
+		__field(unsigned int, drops)
+		__field(unsigned int, processed)
+		__field(int, sched)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map_id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->cpu		= smp_processor_id();
+		__entry->drops		= drops;
+		__entry->processed	= processed;
+		__entry->sched	= sched;
+	),
+
+	TP_printk("kthread"
+		  " cpu=%d map_id=%d action=%s"
+		  " processed=%u drops=%u"
+		  " sched=%d",
+		  __entry->cpu, __entry->map_id,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->processed, __entry->drops,
+		  __entry->sched)
+);
+
+TRACE_EVENT(xdp_cpumap_enqueue,
+
+	TP_PROTO(int map_id, unsigned int processed,  unsigned int drops,
+		 int to_cpu),
+
+	TP_ARGS(map_id, processed, drops, to_cpu),
+
+	TP_STRUCT__entry(
+		__field(int, map_id)
+		__field(u32, act)
+		__field(int, cpu)
+		__field(unsigned int, drops)
+		__field(unsigned int, processed)
+		__field(int, to_cpu)
+	),
+
+	TP_fast_assign(
+		__entry->map_id		= map_id;
+		__entry->act		= XDP_REDIRECT;
+		__entry->cpu		= smp_processor_id();
+		__entry->drops		= drops;
+		__entry->processed	= processed;
+		__entry->to_cpu		= to_cpu;
+	),
+
+	TP_printk("enqueue"
+		  " cpu=%d map_id=%d action=%s"
+		  " processed=%u drops=%u"
+		  " to_cpu=%d",
+		  __entry->cpu, __entry->map_id,
+		  __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+		  __entry->processed, __entry->drops,
+		  __entry->to_cpu)
+);
+
 #endif /* _TRACE_XDP_H */
 
 #include <trace/define_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ee7adf4352dd..b4358d84ddf1 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -24,6 +24,7 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/capability.h>
+#include <trace/events/xdp.h>
 
 #include <linux/netdevice.h>   /* netif_receive_skb_core */
 #include <linux/etherdevice.h> /* eth_type_trans */
@@ -43,6 +44,8 @@ struct xdp_bulk_queue {
 
 /* Struct for every remote "destination" CPU in map */
 struct bpf_cpu_map_entry {
+	u32 cpu;    /* kthread CPU and map index */
+	int map_id; /* Back reference to map */
 	u32 qsize;  /* Queue size placeholder for map lookup */
 
 	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
@@ -280,15 +283,16 @@ static int cpu_map_kthread_run(void *data)
 	 * kthread_stop signal until queue is empty.
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
-		unsigned int processed = 0, drops = 0;
+		unsigned int processed = 0, drops = 0, sched = 0;
 		struct xdp_pkt *xdp_pkt;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
 			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule();
+			sched = 1;
 		} else {
-			cond_resched();
+			sched = cond_resched();
 		}
 		__set_current_state(TASK_RUNNING);
 
@@ -318,6 +322,9 @@ static int cpu_map_kthread_run(void *data)
 			if (++processed == 8)
 				break;
 		}
+		/* Feedback loop via tracepoint */
+		trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
 		local_bh_enable(); /* resched point, may call do_softirq() */
 	}
 	__set_current_state(TASK_RUNNING);
@@ -354,7 +361,9 @@ struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
 	if (err)
 		goto free_queue;
 
-	rcpu->qsize = qsize;
+	rcpu->cpu    = cpu;
+	rcpu->map_id = map_id;
+	rcpu->qsize  = qsize;
 
 	/* Setup kthread */
 	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
@@ -584,6 +593,8 @@ const struct bpf_map_ops cpu_map_ops = {
 static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 			     struct xdp_bulk_queue *bq)
 {
+	unsigned int processed = 0, drops = 0;
+	const int to_cpu = rcpu->cpu;
 	struct ptr_ring *q;
 	int i;
 
@@ -599,13 +610,16 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 
 		err = __ptr_ring_produce(q, xdp_pkt);
 		if (err) {
-			/* Free xdp_pkt */
-			page_frag_free(xdp_pkt);
+			drops++;
+			page_frag_free(xdp_pkt); /* Free xdp_pkt */
 		}
+		processed++;
 	}
 	bq->count = 0;
 	spin_unlock(&q->producer_lock);
 
+	/* Feedback loop via tracepoints */
+	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 5ffeb0501c6b36d080de78372fdb70b404b91e9d Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 25 Jul 2016 16:07:10 +0100
Subject: genirq: export irq_get_percpu_devid_partition to modules

Any modular driver using cluster-affine PPIs needs to be able to call
irq_get_percpu_devid_partition so that it can enable the IRQ on the
correct subset of CPUs.

This patch exports the symbol so that it can be called from within a
module.

Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 kernel/irq/irqdesc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 82afb7ed369f..694c1a9d6485 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -863,6 +863,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
 
 void kstat_incr_irq_this_cpu(unsigned int irq)
 {
-- 
cgit v1.3-14-g43fede


From bc1d202023eb66f088f736ba423bee1cf135c720 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 16 Aug 2016 16:53:15 +0100
Subject: perf/core: Export AUX buffer helpers to modules

Perf PMU drivers using AUX buffers cannot be built as modules unless
the AUX helpers are exported.

This patch exports perf_aux_output_{begin,end,skip} and perf_get_aux to
modules.

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 kernel/events/ring_buffer.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index f684d8e5fa2b..6d9bffe4d6cc 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -411,6 +411,7 @@ err:
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_begin);
 
 static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
 {
@@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 	rb_free_aux(rb);
 	ring_buffer_put(rb);
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_end);
 
 /*
  * Skip over a given number of bytes in the AUX buffer, due to, for example,
@@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_skip);
 
 void *perf_get_aux(struct perf_output_handle *handle)
 {
@@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle)
 
 	return handle->rb->aux_priv;
 }
+EXPORT_SYMBOL_GPL(perf_get_aux);
 
 #define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
-- 
cgit v1.3-14-g43fede


From 7de16e3a35578f4f5accc6f5f23970310483d0a2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:53 -0700
Subject: bpf: split verifier and program ops

struct bpf_verifier_ops contains both verifier ops and operations
used later during program's lifetime (test_run).  Split the runtime
ops into a different structure.

BPF_PROG_TYPE() will now append ## _prog_ops or ## _verifier_ops
to the names.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h       | 15 ++++++++++-----
 include/linux/bpf_types.h | 28 ++++++++++++++--------------
 kernel/bpf/syscall.c      | 16 +++++++++++++---
 kernel/bpf/verifier.c     | 12 ++++++------
 kernel/trace/bpf_trace.c  | 15 ++++++++++++---
 net/core/filter.c         | 45 ++++++++++++++++++++++++++++++++++++---------
 6 files changed, 91 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6d4dd844828a..e1fba5504ca5 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -157,6 +157,11 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
 	aux->ctx_field_size = size;
 }
 
+struct bpf_prog_ops {
+	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
+			union bpf_attr __user *uattr);
+};
+
 struct bpf_verifier_ops {
 	/* return eBPF function prototype for verification */
 	const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
@@ -172,8 +177,6 @@ struct bpf_verifier_ops {
 				  const struct bpf_insn *src,
 				  struct bpf_insn *dst,
 				  struct bpf_prog *prog, u32 *target_size);
-	int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr,
-			union bpf_attr __user *uattr);
 };
 
 struct bpf_prog_aux {
@@ -184,7 +187,8 @@ struct bpf_prog_aux {
 	u32 id;
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
-	const struct bpf_verifier_ops *ops;
+	const struct bpf_prog_ops *ops;
+	const struct bpf_verifier_ops *vops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
@@ -279,8 +283,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
-#define BPF_PROG_TYPE(_id, _ops) \
-	extern const struct bpf_verifier_ops _ops;
+#define BPF_PROG_TYPE(_id, _name) \
+	extern const struct bpf_prog_ops _name ## _prog_ops; \
+	extern const struct bpf_verifier_ops _name ## _verifier_ops;
 #define BPF_MAP_TYPE(_id, _ops) \
 	extern const struct bpf_map_ops _ops;
 #include <linux/bpf_types.h>
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 814c1081a4a9..36418ad43245 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -1,22 +1,22 @@
 /* internal file - do not include directly */
 
 #ifdef CONFIG_NET
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
+BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
+BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 #endif
 #ifdef CONFIG_BPF_EVENTS
-BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops)
-BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops)
+BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
+BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
+BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 54fba06942f5..444902b5a30d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -739,9 +739,18 @@ err_put:
 	return err;
 }
 
-static const struct bpf_verifier_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _ops) \
-	[_id] = &_ops,
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+	[_id] = & _name ## _prog_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+	[_id] = & _name ## _verifier_ops,
 #define BPF_MAP_TYPE(_id, _ops)
 #include <linux/bpf_types.h>
 #undef BPF_PROG_TYPE
@@ -754,6 +763,7 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 		return -EINVAL;
 
 	prog->aux->ops = bpf_prog_types[type];
+	prog->aux->vops = bpf_verifier_ops[type];
 	prog->type = type;
 	return 0;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e4d5136725a2..38e24d69fc95 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -856,8 +856,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 			*reg_type = info.reg_type;
 			return 0;
 		}
-	} else if (env->prog->aux->ops->is_valid_access &&
-		   env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+	} else if (env->prog->aux->vops->is_valid_access &&
+		   env->prog->aux->vops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
@@ -1565,8 +1565,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		return -EINVAL;
 	}
 
-	if (env->prog->aux->ops->get_func_proto)
-		fn = env->prog->aux->ops->get_func_proto(func_id);
+	if (env->prog->aux->vops->get_func_proto)
+		fn = env->prog->aux->vops->get_func_proto(func_id);
 
 	if (!fn) {
 		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
@@ -4035,7 +4035,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
  */
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
-	const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+	const struct bpf_verifier_ops *ops = env->prog->aux->vops;
 	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16], *insn;
@@ -4236,7 +4236,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn      = new_prog->insnsi + i + delta;
 		}
 patch_call_imm:
-		fn = prog->aux->ops->get_func_proto(insn->imm);
+		fn = prog->aux->vops->get_func_proto(insn->imm);
 		/* all functions that have prototype and verifier allowed
 		 * programs to call them, must be real in-kernel functions
 		 */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 04ea5314f2bc..3126da2f468a 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -561,11 +561,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
 	return true;
 }
 
-const struct bpf_verifier_ops kprobe_prog_ops = {
+const struct bpf_verifier_ops kprobe_verifier_ops = {
 	.get_func_proto  = kprobe_prog_func_proto,
 	.is_valid_access = kprobe_prog_is_valid_access,
 };
 
+const struct bpf_prog_ops kprobe_prog_ops = {
+};
+
 BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
@@ -667,11 +670,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
 	return true;
 }
 
-const struct bpf_verifier_ops tracepoint_prog_ops = {
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
 	.get_func_proto  = tp_prog_func_proto,
 	.is_valid_access = tp_prog_is_valid_access,
 };
 
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 				    struct bpf_insn_access_aux *info)
 {
@@ -727,8 +733,11 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
-const struct bpf_verifier_ops perf_event_prog_ops = {
+const struct bpf_verifier_ops perf_event_verifier_ops = {
 	.get_func_proto		= tp_prog_func_proto,
 	.is_valid_access	= pe_prog_is_valid_access,
 	.convert_ctx_access	= pe_prog_convert_ctx_access,
 };
+
+const struct bpf_prog_ops perf_event_prog_ops = {
+};
diff --git a/net/core/filter.c b/net/core/filter.c
index 4d88e0665c41..1dd3034f846f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4395,68 +4395,95 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
-const struct bpf_verifier_ops sk_filter_prog_ops = {
+const struct bpf_verifier_ops sk_filter_verifier_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 };
 
-const struct bpf_verifier_ops tc_cls_act_prog_ops = {
+const struct bpf_prog_ops sk_filter_prog_ops = {
+};
+
+const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.get_func_proto		= tc_cls_act_func_proto,
 	.is_valid_access	= tc_cls_act_is_valid_access,
 	.convert_ctx_access	= tc_cls_act_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
+};
+
+const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops xdp_prog_ops = {
+const struct bpf_verifier_ops xdp_verifier_ops = {
 	.get_func_proto		= xdp_func_proto,
 	.is_valid_access	= xdp_is_valid_access,
 	.convert_ctx_access	= xdp_convert_ctx_access,
+};
+
+const struct bpf_prog_ops xdp_prog_ops = {
 	.test_run		= bpf_prog_test_run_xdp,
 };
 
-const struct bpf_verifier_ops cg_skb_prog_ops = {
+const struct bpf_verifier_ops cg_skb_verifier_ops = {
 	.get_func_proto		= sk_filter_func_proto,
 	.is_valid_access	= sk_filter_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops cg_skb_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops lwt_inout_prog_ops = {
+const struct bpf_verifier_ops lwt_inout_verifier_ops = {
 	.get_func_proto		= lwt_inout_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
+};
+
+const struct bpf_prog_ops lwt_inout_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops lwt_xmit_prog_ops = {
+const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
 	.get_func_proto		= lwt_xmit_func_proto,
 	.is_valid_access	= lwt_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 	.gen_prologue		= tc_cls_act_prologue,
+};
+
+const struct bpf_prog_ops lwt_xmit_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
 
-const struct bpf_verifier_ops cg_sock_prog_ops = {
+const struct bpf_verifier_ops cg_sock_verifier_ops = {
 	.get_func_proto		= sock_filter_func_proto,
 	.is_valid_access	= sock_filter_is_valid_access,
 	.convert_ctx_access	= sock_filter_convert_ctx_access,
 };
 
-const struct bpf_verifier_ops sock_ops_prog_ops = {
+const struct bpf_prog_ops cg_sock_prog_ops = {
+};
+
+const struct bpf_verifier_ops sock_ops_verifier_ops = {
 	.get_func_proto		= sock_ops_func_proto,
 	.is_valid_access	= sock_ops_is_valid_access,
 	.convert_ctx_access	= sock_ops_convert_ctx_access,
 };
 
-const struct bpf_verifier_ops sk_skb_prog_ops = {
+const struct bpf_prog_ops sock_ops_prog_ops = {
+};
+
+const struct bpf_verifier_ops sk_skb_verifier_ops = {
 	.get_func_proto		= sk_skb_func_proto,
 	.is_valid_access	= sk_skb_is_valid_access,
 	.convert_ctx_access	= bpf_convert_ctx_access,
 	.gen_prologue		= sk_skb_prologue,
 };
 
+const struct bpf_prog_ops sk_skb_prog_ops = {
+};
+
 int sk_detach_filter(struct sock *sk)
 {
 	int ret = -ENOENT;
-- 
cgit v1.3-14-g43fede


From 00176a34d9e27ab1e77db75fe13abc005cffe0ca Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:54 -0700
Subject: bpf: remove the verifier ops from program structure

Since the verifier ops don't have to be associated with
the program for its entire lifetime we can move it to
verifier's struct bpf_verifier_env.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  1 -
 include/linux/bpf_verifier.h |  1 +
 kernel/bpf/syscall.c         | 10 ----------
 kernel/bpf/verifier.c        | 23 +++++++++++++++++------
 4 files changed, 18 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e1fba5504ca5..cf91977e8719 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -188,7 +188,6 @@ struct bpf_prog_aux {
 	struct latch_tree_node ksym_tnode;
 	struct list_head ksym_lnode;
 	const struct bpf_prog_ops *ops;
-	const struct bpf_verifier_ops *vops;
 	struct bpf_map **used_maps;
 	struct bpf_prog *prog;
 	struct user_struct *user;
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index f00ef751c1c5..feeaea93d959 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -141,6 +141,7 @@ struct bpf_ext_analyzer_ops {
  */
 struct bpf_verifier_env {
 	struct bpf_prog *prog;		/* eBPF program being verified */
+	const struct bpf_verifier_ops *ops;
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 444902b5a30d..0e893cac6795 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -748,22 +748,12 @@ static const struct bpf_prog_ops * const bpf_prog_types[] = {
 #undef BPF_MAP_TYPE
 };
 
-static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
-#define BPF_PROG_TYPE(_id, _name) \
-	[_id] = & _name ## _verifier_ops,
-#define BPF_MAP_TYPE(_id, _ops)
-#include <linux/bpf_types.h>
-#undef BPF_PROG_TYPE
-#undef BPF_MAP_TYPE
-};
-
 static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 {
 	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
 		return -EINVAL;
 
 	prog->aux->ops = bpf_prog_types[type];
-	prog->aux->vops = bpf_verifier_ops[type];
 	prog->type = type;
 	return 0;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 38e24d69fc95..3b6e2c550e96 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -23,6 +23,15 @@
 
 #include "disasm.h"
 
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+	[_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -856,8 +865,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 			*reg_type = info.reg_type;
 			return 0;
 		}
-	} else if (env->prog->aux->vops->is_valid_access &&
-		   env->prog->aux->vops->is_valid_access(off, size, t, &info)) {
+	} else if (env->ops->is_valid_access &&
+		   env->ops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
@@ -1565,8 +1574,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		return -EINVAL;
 	}
 
-	if (env->prog->aux->vops->get_func_proto)
-		fn = env->prog->aux->vops->get_func_proto(func_id);
+	if (env->ops->get_func_proto)
+		fn = env->ops->get_func_proto(func_id);
 
 	if (!fn) {
 		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
@@ -4035,7 +4044,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
  */
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
-	const struct bpf_verifier_ops *ops = env->prog->aux->vops;
+	const struct bpf_verifier_ops *ops = env->ops;
 	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16], *insn;
@@ -4236,7 +4245,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn      = new_prog->insnsi + i + delta;
 		}
 patch_call_imm:
-		fn = prog->aux->vops->get_func_proto(insn->imm);
+		fn = env->ops->get_func_proto(insn->imm);
 		/* all functions that have prototype and verifier allowed
 		 * programs to call them, must be real in-kernel functions
 		 */
@@ -4294,6 +4303,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!env->insn_aux_data)
 		goto err_free_env;
 	env->prog = *prog;
+	env->ops = bpf_verifier_ops[env->prog->type];
 
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
@@ -4406,6 +4416,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	if (!env->insn_aux_data)
 		goto err_free_env;
 	env->prog = prog;
+	env->ops = bpf_verifier_ops[env->prog->type];
 	env->analyzer_ops = ops;
 	env->analyzer_priv = priv;
 
-- 
cgit v1.3-14-g43fede


From 4f9218aaf8a463f76cac40aa08d859d065f8cc9e Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 16 Oct 2017 16:40:55 -0700
Subject: bpf: move knowledge about post-translation offsets out of verifier

Use the fact that verifier ops are now separate from program
ops to define a separate set of callbacks for verification of
already translated programs.

Since we expect the analyzer ops to be defined only for
a small subset of all program types initialize their array
by hand (don't use linux/bpf_types.h).

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  3 +++
 kernel/bpf/verifier.c | 55 +++++++++++++++------------------------------------
 net/core/filter.c     | 40 +++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index cf91977e8719..d67ccdc0099f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -291,6 +291,9 @@ DECLARE_PER_CPU(int, bpf_prog_active);
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
 
+extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
+extern const struct bpf_verifier_ops xdp_analyzer_ops;
+
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3b6e2c550e96..545b8c45a578 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -822,36 +822,6 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 	return err;
 }
 
-static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off,
-				     struct bpf_insn_access_aux *info)
-{
-	switch (env->prog->type) {
-	case BPF_PROG_TYPE_XDP:
-		switch (off) {
-		case offsetof(struct xdp_buff, data):
-			info->reg_type = PTR_TO_PACKET;
-			return true;
-		case offsetof(struct xdp_buff, data_end):
-			info->reg_type = PTR_TO_PACKET_END;
-			return true;
-		}
-		return false;
-	case BPF_PROG_TYPE_SCHED_CLS:
-		switch (off) {
-		case offsetof(struct sk_buff, data):
-			info->reg_type = PTR_TO_PACKET;
-			return true;
-		case offsetof(struct sk_buff, cb) +
-		     offsetof(struct bpf_skb_data_end, data_end):
-			info->reg_type = PTR_TO_PACKET_END;
-			return true;
-		}
-		return false;
-	default:
-		return false;
-	}
-}
-
 /* check access to 'struct bpf_context' fields.  Supports fixed offsets only */
 static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
 			    enum bpf_access_type t, enum bpf_reg_type *reg_type)
@@ -860,13 +830,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		.reg_type = *reg_type,
 	};
 
-	if (env->analyzer_ops) {
-		if (analyzer_is_valid_access(env, off, &info)) {
-			*reg_type = info.reg_type;
-			return 0;
-		}
-	} else if (env->ops->is_valid_access &&
-		   env->ops->is_valid_access(off, size, t, &info)) {
+	if (env->ops->is_valid_access &&
+	    env->ops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
@@ -874,9 +839,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		 * will only allow for whole field access and rejects any other
 		 * type of narrower access.
 		 */
-		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		*reg_type = info.reg_type;
 
+		if (env->analyzer_ops)
+			return 0;
+
+		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
 			env->prog->aux->max_ctx_offset = off + size;
@@ -4400,12 +4368,21 @@ err_free_env:
 	return ret;
 }
 
+static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = {
+	[BPF_PROG_TYPE_XDP]		= &xdp_analyzer_ops,
+	[BPF_PROG_TYPE_SCHED_CLS]	= &tc_cls_act_analyzer_ops,
+};
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv)
 {
 	struct bpf_verifier_env *env;
 	int ret;
 
+	if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) ||
+	    !bpf_analyzer_ops[prog->type])
+		return -EOPNOTSUPP;
+
 	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
 	if (!env)
 		return -ENOMEM;
@@ -4416,7 +4393,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	if (!env->insn_aux_data)
 		goto err_free_env;
 	env->prog = prog;
-	env->ops = bpf_verifier_ops[env->prog->type];
+	env->ops = bpf_analyzer_ops[env->prog->type];
 	env->analyzer_ops = ops;
 	env->analyzer_priv = priv;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 1dd3034f846f..7373a08fbef7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3732,6 +3732,23 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
+static bool
+tc_cls_act_is_valid_access_analyzer(int off, int size,
+				    enum bpf_access_type type,
+				    struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case offsetof(struct sk_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		return true;
+	case offsetof(struct sk_buff, cb) +
+	     offsetof(struct bpf_skb_data_end, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		return true;
+	}
+	return false;
+}
+
 static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
@@ -3766,6 +3783,21 @@ static bool xdp_is_valid_access(int off, int size,
 	return __is_valid_xdp_access(off, size);
 }
 
+static bool xdp_is_valid_access_analyzer(int off, int size,
+					 enum bpf_access_type type,
+					 struct bpf_insn_access_aux *info)
+{
+	switch (off) {
+	case offsetof(struct xdp_buff, data):
+		info->reg_type = PTR_TO_PACKET;
+		return true;
+	case offsetof(struct xdp_buff, data_end):
+		info->reg_type = PTR_TO_PACKET_END;
+		return true;
+	}
+	return false;
+}
+
 void bpf_warn_invalid_xdp_action(u32 act)
 {
 	const u32 act_max = XDP_REDIRECT;
@@ -4411,6 +4443,10 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.gen_prologue		= tc_cls_act_prologue,
 };
 
+const struct bpf_verifier_ops tc_cls_act_analyzer_ops = {
+	.is_valid_access	= tc_cls_act_is_valid_access_analyzer,
+};
+
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
@@ -4421,6 +4457,10 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
 	.convert_ctx_access	= xdp_convert_ctx_access,
 };
 
+const struct bpf_verifier_ops xdp_analyzer_ops = {
+	.is_valid_access	= xdp_is_valid_access_analyzer,
+};
+
 const struct bpf_prog_ops xdp_prog_ops = {
 	.test_run		= bpf_prog_test_run_xdp,
 };
-- 
cgit v1.3-14-g43fede


From 4f3a871443669c6b4d458a60ac8d8ca5eedc3f97 Mon Sep 17 00:00:00 2001
From: "Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>
Date: Tue, 17 Oct 2017 13:48:34 +0530
Subject: Revert "kprobes: Warn if optprobe handler tries to change execution
 path"

This reverts commit:

  e863d539614641 ("kprobes: Warn if optprobe handler tries to change execution path")

On PowerPC, we place a probe at kretprobe_trampoline to catch function
returns and with CONFIG_OPTPROBES=y, this probe gets optimized. This
works for us due to the way we handle the optprobe as described in
commit:

  762df10bad6954 ("powerpc/kprobes: Optimize kprobe in kretprobe_trampoline()")

With the above commit, we end up with a warning. As such, revert this change.

Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171017081834.3629-1-naveen.n.rao@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/kprobes.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 2d28377a0e32..15fba7fe57c8 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -387,10 +387,7 @@ void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	list_for_each_entry_rcu(kp, &p->list, list) {
 		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
 			set_kprobe_instance(kp);
-			if (kp->pre_handler(kp, regs)) {
-				if (WARN_ON_ONCE(1))
-					pr_err("Optprobe ignores instruction pointer changing.(%pF)\n", p->addr);
-			}
+			kp->pre_handler(kp, regs);
 		}
 		reset_kprobe_instance();
 	}
-- 
cgit v1.3-14-g43fede


From c310ce4dcb9df9b2f1be82caff7dae609fe53f72 Mon Sep 17 00:00:00 2001
From: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Date: Sun, 8 Oct 2017 20:55:59 -0700
Subject: timers: Avoid an unnecessary iteration in __run_timers()

If the base clock is behind jiffies in the soft irq expiry code then the
next timer is retrieved by get_next_timer_interrupt() to avoid incrementing
base clock one by one. If the next timer interrupt is past current jiffies
then the base clock is set to jiffies - 1. At the call site this is
incremented and another iteration through the expiry loop is executed which
checks empty hash buckets.

That's a pointless excercise because it's already known that the next timer
is past jiffies.

Set the base clock in that case to jiffies directly so it gets incremented
to jiffies + 1 at the call site resulting in immediate termination of the
expiry loop.

[ tglx: Massaged changelog and added comment to the code ]

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Cc: Joe Jin <joe.jin@oracle.com>
Cc: sboyd@codeaurora.org
Cc: Srinivas Reddy Eeda <srinivas.eeda@oracle.com>
Cc: john.stultz@linaro.org
Link: https://lkml.kernel.org/r/7086a857-f90c-4616-bbe8-f7696f21626c@default
---
 kernel/time/timer.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 38613ced2324..ee1a88d8afb2 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1560,8 +1560,11 @@ static int collect_expired_timers(struct timer_base *base,
 		 * jiffies, otherwise forward to the next expiry time:
 		 */
 		if (time_after(next, jiffies)) {
-			/* The call site will increment clock! */
-			base->clk = jiffies - 1;
+			/*
+			 * The call site will increment base->clk and then
+			 * terminate the expiry loop immediately.
+			 */
+			base->clk = jiffies;
 			return 0;
 		}
 		base->clk = next;
-- 
cgit v1.3-14-g43fede


From 2b5175c4fa974b6aa05bbd2ee8d443a8036a1714 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 17 Oct 2017 09:54:57 +0200
Subject: genirq: Add config option for reservation mode

The interrupt reservation mode requires reactivation of PCI/MSI
interrupts. Create a config option, so the PCI code can set the
corresponding flag when required.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Josh Poulson <jopoulso@microsoft.com>
Cc: Mihai Costache <v-micos@microsoft.com>
Cc: Stephen Hemminger <sthemmin@microsoft.com>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Cc: linux-pci@vger.kernel.org
Cc: Haiyang Zhang <haiyangz@microsoft.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Simon Xiao <sixiao@microsoft.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Cc: Jork Loeser <Jork.Loeser@microsoft.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: devel@linuxdriverproject.org
Cc: KY Srinivasan <kys@microsoft.com>
Link: https://lkml.kernel.org/r/20171017075600.369375409@linutronix.de
---
 kernel/irq/Kconfig | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index ac1a3e29d3b9..89e355866450 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -100,6 +100,9 @@ config IRQ_TIMINGS
 config GENERIC_IRQ_MATRIX_ALLOCATOR
 	bool
 
+config GENERIC_IRQ_RESERVATION_MODE
+	bool
+
 config IRQ_DOMAIN_DEBUG
 	bool "Expose hardware/virtual IRQ mapping via debugfs"
 	depends on IRQ_DOMAIN && DEBUG_FS
-- 
cgit v1.3-14-g43fede


From 32a6c7233c41216f5dd41fc7bf100eedb1063dfc Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 16 Oct 2017 15:58:25 -0700
Subject: workqueue: Convert timers to use timer_setup() (part 2)

In preparation for unconditionally passing the struct timer_list pointer
to all timer callbacks, switch to using the new timer_setup() and
from_timer() to pass the timer pointer explicitly. (The prior workqueue
patch missed a few timers.)

Signed-off-by: Kees Cook <keescook@chromium.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Link: https://lkml.kernel.org/r/20171016225825.GA99101@beast
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/workqueue.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c77fdf6bf24f..6e5eed58f215 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1831,9 +1831,9 @@ static void destroy_worker(struct worker *worker)
 	wake_up_process(worker->task);
 }
 
-static void idle_worker_timeout(unsigned long __pool)
+static void idle_worker_timeout(struct timer_list *t)
 {
-	struct worker_pool *pool = (void *)__pool;
+	struct worker_pool *pool = from_timer(pool, t, idle_timer);
 
 	spin_lock_irq(&pool->lock);
 
@@ -1879,9 +1879,9 @@ static void send_mayday(struct work_struct *work)
 	}
 }
 
-static void pool_mayday_timeout(unsigned long __pool)
+static void pool_mayday_timeout(struct timer_list *t)
 {
-	struct worker_pool *pool = (void *)__pool;
+	struct worker_pool *pool = from_timer(pool, t, mayday_timer);
 	struct work_struct *work;
 
 	spin_lock_irq(&pool->lock);
@@ -3241,11 +3241,9 @@ static int init_worker_pool(struct worker_pool *pool)
 	INIT_LIST_HEAD(&pool->idle_list);
 	hash_init(pool->busy_hash);
 
-	setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout,
-			       (unsigned long)pool);
+	timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
 
-	setup_timer(&pool->mayday_timer, pool_mayday_timeout,
-		    (unsigned long)pool);
+	timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
 
 	mutex_init(&pool->manager_arb);
 	mutex_init(&pool->attach_mutex);
-- 
cgit v1.3-14-g43fede


From ba16490eac146ebb178017e5de3d61c645552fab Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Oct 2017 16:10:19 +0200
Subject: timer: Convert stub timer to timer_setup()

In preparation for unconditionally passing the struct timer_list pointer
to all timer callbacks, switch to using the new timer_setup() and
from_timer() to pass the timer pointer explicitly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Kees Cook <keescook@chromium.org>
---
 kernel/time/timer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ee1a88d8afb2..fbb1f85327bf 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -610,7 +610,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state)
 }
 
 /* Stub timer callback for improperly used timers. */
-static void stub_timer(unsigned long data)
+static void stub_timer(struct timer_list *unused)
 {
 	WARN_ON(1);
 }
@@ -626,7 +626,7 @@ static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		setup_timer(timer, stub_timer, 0);
+		timer_setup(timer, stub_timer, 0);
 		return true;
 
 	case ODEBUG_STATE_ACTIVE:
@@ -665,7 +665,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		setup_timer(timer, stub_timer, 0);
+		timer_setup(timer, stub_timer, 0);
 		return true;
 	default:
 		return false;
-- 
cgit v1.3-14-g43fede


From 5cdda5117e125e0dbb020425cc55a4c143c6febc Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Wed, 18 Oct 2017 17:24:28 +0200
Subject: locking/static_keys: Improve uninitialized key warning

Right now it says:

  static_key_disable_cpuslocked used before call to jump_label_init
  ------------[ cut here ]------------
  WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:161 static_key_disable_cpuslocked+0x68/0x70
  Modules linked in:
  CPU: 0 PID: 0 Comm: swapper Not tainted 4.14.0-rc5+ #1
  Hardware name: SGI.COM C2112-4GP3/X10DRT-P-Series, BIOS 2.0a 05/09/2016
  task: ffffffff81c0e480 task.stack: ffffffff81c00000
  RIP: 0010:static_key_disable_cpuslocked+0x68/0x70
  RSP: 0000:ffffffff81c03ef0 EFLAGS: 00010096 ORIG_RAX: 0000000000000000
  RAX: 0000000000000041 RBX: ffffffff81c32680 RCX: ffffffff81c5cbf8
  RDX: 0000000000000001 RSI: 0000000000000092 RDI: 0000000000000002
  RBP: ffff88807fffd240 R08: 726f666562206465 R09: 0000000000000136
  R10: 0000000000000000 R11: 696e695f6c656261 R12: ffffffff82158900
  R13: ffffffff8215f760 R14: 0000000000000001 R15: 0000000000000008
  FS:  0000000000000000(0000) GS:ffff883f7f400000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: ffff88807ffff000 CR3: 0000000001c09000 CR4: 00000000000606b0
  Call Trace:
   static_key_disable+0x16/0x20
   start_kernel+0x15a/0x45d
   ? load_ucode_intel_bsp+0x11/0x2d
   secondary_startup_64+0xa5/0xb0
  Code: 48 c7 c7 a0 15 cf 81 e9 47 53 4b 00 48 89 df e8 5f fc ff ff eb e8 48 c7 c6 \
	c0 97 83 81 48 c7 c7 d0 ff a2 81 31 c0 e8 c5 9d f5 ff <0f> ff eb a7 0f ff eb \
	b0 e8 eb a2 4b 00 53 48 89 fb e8 42 0e f0

but it doesn't tell me which key it is. So dump the key's name too:

  static_key_disable_cpuslocked(): static key 'virt_spin_lock_key' used before call to jump_label_init()

And that makes pinpointing which key is causing that a lot easier.

 include/linux/jump_label.h           |   14 +++++++-------
 include/linux/jump_label_ratelimit.h |    6 +++---
 kernel/jump_label.c                  |   14 +++++++-------
 3 files changed, 17 insertions(+), 17 deletions(-)

Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171018152428.ffjgak4o25f7ept6@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h           | 14 +++++++-------
 include/linux/jump_label_ratelimit.h |  6 +++---
 kernel/jump_label.c                  | 14 +++++++-------
 3 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index cd5861651b17..979a2f2d529b 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -81,9 +81,9 @@
 
 extern bool static_key_initialized;
 
-#define STATIC_KEY_CHECK_USE() WARN(!static_key_initialized,		      \
-				    "%s used before call to jump_label_init", \
-				    __func__)
+#define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized,		      \
+				    "%s(): static key '%pS' used before call to jump_label_init()", \
+				    __func__, (key))
 
 #ifdef HAVE_JUMP_LABEL
 
@@ -211,13 +211,13 @@ static __always_inline bool static_key_true(struct static_key *key)
 
 static inline void static_key_slow_inc(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	atomic_inc(&key->enabled);
 }
 
 static inline void static_key_slow_dec(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	atomic_dec(&key->enabled);
 }
 
@@ -236,7 +236,7 @@ static inline int jump_label_apply_nops(struct module *mod)
 
 static inline void static_key_enable(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	if (atomic_read(&key->enabled) != 0) {
 		WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
@@ -247,7 +247,7 @@ static inline void static_key_enable(struct static_key *key)
 
 static inline void static_key_disable(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	if (atomic_read(&key->enabled) != 1) {
 		WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h
index 23da3af459fe..93086df0a847 100644
--- a/include/linux/jump_label_ratelimit.h
+++ b/include/linux/jump_label_ratelimit.h
@@ -24,18 +24,18 @@ struct static_key_deferred {
 };
 static inline void static_key_slow_dec_deferred(struct static_key_deferred *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	static_key_slow_dec(&key->key);
 }
 static inline void static_key_deferred_flush(struct static_key_deferred *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 }
 static inline void
 jump_label_rate_limit(struct static_key_deferred *key,
 		unsigned long rl)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 }
 #endif	/* HAVE_JUMP_LABEL */
 #endif	/* _LINUX_JUMP_LABEL_RATELIMIT_H */
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0bf2e8f5244a..8ff4ca4665ff 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -83,7 +83,7 @@ static void static_key_slow_inc_cpuslocked(struct static_key *key)
 {
 	int v, v1;
 
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	/*
 	 * Careful if we get concurrent static_key_slow_inc() calls;
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
 
 void static_key_enable_cpuslocked(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	if (atomic_read(&key->enabled) > 0) {
 		WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(static_key_enable);
 
 void static_key_disable_cpuslocked(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 
 	if (atomic_read(&key->enabled) != 1) {
 		WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
@@ -224,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work)
 
 void static_key_slow_dec(struct static_key *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	__static_key_slow_dec(key, 0, NULL);
 }
 EXPORT_SYMBOL_GPL(static_key_slow_dec);
 
 void static_key_slow_dec_deferred(struct static_key_deferred *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	__static_key_slow_dec(&key->key, key->timeout, &key->work);
 }
 EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
 
 void static_key_deferred_flush(struct static_key_deferred *key)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	flush_delayed_work(&key->work);
 }
 EXPORT_SYMBOL_GPL(static_key_deferred_flush);
@@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush);
 void jump_label_rate_limit(struct static_key_deferred *key,
 		unsigned long rl)
 {
-	STATIC_KEY_CHECK_USE();
+	STATIC_KEY_CHECK_USE(key);
 	key->timeout = rl;
 	INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
 }
-- 
cgit v1.3-14-g43fede


From 93862e385ded7c60351e09fcd2a541d273650905 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 13 Oct 2017 15:08:41 -0400
Subject: livepatch: add (un)patch callbacks

Provide livepatch modules a klp_object (un)patching notification
mechanism.  Pre and post-(un)patch callbacks allow livepatch modules to
setup or synchronize changes that would be difficult to support in only
patched-or-unpatched code contexts.

Callbacks can be registered for target module or vmlinux klp_objects,
but each implementation is klp_object specific.

  - Pre-(un)patch callbacks run before any (un)patching transition
    starts.

  - Post-(un)patch callbacks run once an object has been (un)patched and
    the klp_patch fully transitioned to its target state.

Example use cases include modification of global data and registration
of newly available services/handlers.

See Documentation/livepatch/callbacks.txt for details and
samples/livepatch/ for examples.

Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/livepatch/callbacks.txt           | 605 ++++++++++++++++++++++++
 include/linux/livepatch.h                       |  26 +
 kernel/livepatch/core.c                         |  51 +-
 kernel/livepatch/core.h                         |  38 ++
 kernel/livepatch/patch.c                        |   1 +
 kernel/livepatch/transition.c                   |  21 +-
 samples/livepatch/Makefile                      |   3 +
 samples/livepatch/livepatch-callbacks-busymod.c |  72 +++
 samples/livepatch/livepatch-callbacks-demo.c    | 234 +++++++++
 samples/livepatch/livepatch-callbacks-mod.c     |  53 +++
 10 files changed, 1091 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/livepatch/callbacks.txt
 create mode 100644 samples/livepatch/livepatch-callbacks-busymod.c
 create mode 100644 samples/livepatch/livepatch-callbacks-demo.c
 create mode 100644 samples/livepatch/livepatch-callbacks-mod.c

(limited to 'kernel')

diff --git a/Documentation/livepatch/callbacks.txt b/Documentation/livepatch/callbacks.txt
new file mode 100644
index 000000000000..c9776f48e458
--- /dev/null
+++ b/Documentation/livepatch/callbacks.txt
@@ -0,0 +1,605 @@
+======================
+(Un)patching Callbacks
+======================
+
+Livepatch (un)patch-callbacks provide a mechanism for livepatch modules
+to execute callback functions when a kernel object is (un)patched.  They
+can be considered a "power feature" that extends livepatching abilities
+to include:
+
+  - Safe updates to global data
+
+  - "Patches" to init and probe functions
+
+  - Patching otherwise unpatchable code (i.e. assembly)
+
+In most cases, (un)patch callbacks will need to be used in conjunction
+with memory barriers and kernel synchronization primitives, like
+mutexes/spinlocks, or even stop_machine(), to avoid concurrency issues.
+
+Callbacks differ from existing kernel facilities:
+
+  - Module init/exit code doesn't run when disabling and re-enabling a
+    patch.
+
+  - A module notifier can't stop a to-be-patched module from loading.
+
+Callbacks are part of the klp_object structure and their implementation
+is specific to that klp_object.  Other livepatch objects may or may not
+be patched, irrespective of the target klp_object's current state.
+
+Callbacks can be registered for the following livepatch actions:
+
+  * Pre-patch    - before a klp_object is patched
+
+  * Post-patch   - after a klp_object has been patched and is active
+                   across all tasks
+
+  * Pre-unpatch  - before a klp_object is unpatched (ie, patched code is
+                   active), used to clean up post-patch callback
+                   resources
+
+  * Post-unpatch - after a klp_object has been patched, all code has
+                   been restored and no tasks are running patched code,
+                   used to cleanup pre-patch callback resources
+
+Each callback is optional, omitting one does not preclude specifying any
+other.  However, the livepatching core executes the handlers in
+symmetry: pre-patch callbacks have a post-unpatch counterpart and
+post-patch callbacks have a pre-unpatch counterpart.  An unpatch
+callback will only be executed if its corresponding patch callback was
+executed.  Typical use cases pair a patch handler that acquires and
+configures resources with an unpatch handler tears down and releases
+those same resources.
+
+A callback is only executed if its host klp_object is loaded.  For
+in-kernel vmlinux targets, this means that callbacks will always execute
+when a livepatch is enabled/disabled.  For patch target kernel modules,
+callbacks will only execute if the target module is loaded.  When a
+module target is (un)loaded, its callbacks will execute only if the
+livepatch module is enabled.
+
+The pre-patch callback, if specified, is expected to return a status
+code (0 for success, -ERRNO on error).  An error status code indicates
+to the livepatching core that patching of the current klp_object is not
+safe and to stop the current patching request.  (When no pre-patch
+callback is provided, the transition is assumed to be safe.)  If a
+pre-patch callback returns failure, the kernel's module loader will:
+
+  - Refuse to load a livepatch, if the livepatch is loaded after
+    targeted code.
+
+    or:
+
+  - Refuse to load a module, if the livepatch was already successfully
+    loaded.
+
+No post-patch, pre-unpatch, or post-unpatch callbacks will be executed
+for a given klp_object if the object failed to patch, due to a failed
+pre_patch callback or for any other reason.
+
+If a patch transition is reversed, no pre-unpatch handlers will be run
+(this follows the previously mentioned symmetry -- pre-unpatch callbacks
+will only occur if their corresponding post-patch callback executed).
+
+If the object did successfully patch, but the patch transition never
+started for some reason (e.g., if another object failed to patch),
+only the post-unpatch callback will be called.
+
+
+Example Use-cases
+=================
+
+Update global data
+------------------
+
+A pre-patch callback can be useful to update a global variable.  For
+example, 75ff39ccc1bd ("tcp: make challenge acks less predictable")
+changes a global sysctl, as well as patches the tcp_send_challenge_ack()
+function.
+
+In this case, if we're being super paranoid, it might make sense to
+patch the data *after* patching is complete with a post-patch callback,
+so that tcp_send_challenge_ack() could first be changed to read
+sysctl_tcp_challenge_ack_limit with READ_ONCE.
+
+
+Support __init and probe function patches
+-----------------------------------------
+
+Although __init and probe functions are not directly livepatch-able, it
+may be possible to implement similar updates via pre/post-patch
+callbacks.
+
+48900cb6af42 ("virtio-net: drop NETIF_F_FRAGLIST") change the way that
+virtnet_probe() initialized its driver's net_device features.  A
+pre/post-patch callback could iterate over all such devices, making a
+similar change to their hw_features value.  (Client functions of the
+value may need to be updated accordingly.)
+
+
+Test cases
+==========
+
+What follows is not an exhaustive test suite of every possible livepatch
+pre/post-(un)patch combination, but a selection that demonstrates a few
+important concepts.  Each test case uses the kernel modules located in
+the samples/livepatch/ and assumes that no livepatches are loaded at the
+beginning of the test.
+
+
+Test 1
+------
+
+Test a combination of loading a kernel module and a livepatch that
+patches a function in the first module.  (Un)load the target module
+before the livepatch module:
+
+- load target module
+- load livepatch
+- disable livepatch
+- unload target module
+- unload livepatch
+
+First load a target module:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   34.475708] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+On livepatch enable, before the livepatch transition starts, pre-patch
+callbacks are executed for vmlinux and livepatch_callbacks_mod (those
+klp_objects currently loaded).  After klp_objects are patched according
+to the klp_patch, their post-patch callbacks run and the transition
+completes:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   36.503719] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   36.504213] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   36.504238] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   36.504721] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   36.505849] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   37.727133] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   37.727232] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   37.727860] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   37.728792] livepatch: 'livepatch_callbacks_demo': patching complete
+
+Similarly, on livepatch disable, pre-patch callbacks run before the
+unpatching transition starts.  klp_objects are reverted, post-patch
+callbacks execute and the transition completes:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   38.510209] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   38.510234] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   38.510982] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   38.512209] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   39.711132] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   39.711210] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   39.711779] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   39.712735] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   42.534183] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+
+
+Test 2
+------
+
+This test is similar to the previous test, but (un)load the livepatch
+module before the target kernel module.  This tests the livepatch core's
+module_coming handler:
+
+- load livepatch
+- load target module
+- disable livepatch
+- unload livepatch
+- unload target module
+
+
+On livepatch enable, only pre/post-patch callbacks are executed for
+currently loaded klp_objects, in this case, vmlinux:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   44.553328] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   44.553997] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   44.554049] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   44.554845] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   45.727128] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   45.727212] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   45.727961] livepatch: 'livepatch_callbacks_demo': patching complete
+
+When a targeted module is subsequently loaded, only its pre/post-patch
+callbacks are executed:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   46.560845] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [   46.561988] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   46.563452] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   46.565495] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+On livepatch disable, all currently loaded klp_objects' (vmlinux and
+livepatch_callbacks_mod) pre/post-unpatch callbacks are executed:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   48.568885] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   48.568910] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   48.569441] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   48.570502] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   49.759091] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   49.759171] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   49.759742] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   49.760690] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   52.592283] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+
+
+Test 3
+------
+
+Test loading the livepatch after a targeted kernel module, then unload
+the kernel module before disabling the livepatch.  This tests the
+livepatch core's module_going handler:
+
+- load target module
+- load livepatch
+- unload target module
+- disable livepatch
+- unload livepatch
+
+First load a target module, then the livepatch:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   54.607948] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   56.613919] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   56.614411] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   56.614436] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   56.614818] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   56.615656] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   57.759070] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   57.759147] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   57.759621] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_LIVE] Normal state
+  [   57.760307] livepatch: 'livepatch_callbacks_demo': patching complete
+
+When a target module is unloaded, the livepatch is only reverted from
+that klp_object (livepatch_callbacks_mod).  As such, only its pre and
+post-unpatch callbacks are executed when this occurs:
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   58.623409] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+  [   58.623903] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+  [   58.624658] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod'
+  [   58.625305] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+
+When the livepatch is disabled, pre and post-unpatch callbacks are run
+for the remaining klp_object, vmlinux:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   60.638420] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   60.638444] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   60.638996] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   61.727088] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   61.727165] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   61.727985] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+
+
+Test 4
+------
+
+This test is similar to the previous test, however the livepatch is
+loaded first.  This tests the livepatch core's module_coming and
+module_going handlers:
+
+- load livepatch
+- load target module
+- unload target module
+- disable livepatch
+- unload livepatch
+
+First load the livepatch:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   64.661552] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   64.662147] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   64.662175] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   64.662850] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   65.695056] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   65.695147] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   65.695561] livepatch: 'livepatch_callbacks_demo': patching complete
+
+When a targeted kernel module is subsequently loaded, only its
+pre/post-patch callbacks are executed:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   66.669196] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [   66.669882] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   66.670744] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   66.672873] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+When the target module is unloaded, the livepatch is only reverted from
+the livepatch_callbacks_mod klp_object.  As such, only pre and
+post-unpatch callbacks are executed when this occurs:
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   68.680065] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+  [   68.680688] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+  [   68.681452] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod'
+  [   68.682094] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   70.689225] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   70.689256] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   70.689882] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   71.711080] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   71.711481] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   71.711988] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+
+
+Test 5
+------
+
+A simple test of loading a livepatch without one of its patch target
+klp_objects ever loaded (livepatch_callbacks_mod):
+
+- load livepatch
+- disable livepatch
+- unload livepatch
+
+Load the livepatch:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   74.711081] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   74.711595] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   74.711639] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   74.712272] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   75.743137] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   75.743219] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   75.743867] livepatch: 'livepatch_callbacks_demo': patching complete
+
+As expected, only pre/post-(un)patch handlers are executed for vmlinux:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   76.716254] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   76.716278] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   76.716666] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   77.727089] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   77.727194] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   77.727907] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+
+
+Test 6
+------
+
+Test a scenario where a vmlinux pre-patch callback returns a non-zero
+status (ie, failure):
+
+- load target module
+- load livepatch -ENODEV
+- unload target module
+
+First load a target module:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   80.740520] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+Load the livepatch module, setting its 'pre_patch_ret' value to -19
+(-ENODEV).  When its vmlinux pre-patch callback executed, this status
+code will propagate back to the module-loading subsystem.  The result is
+that the insmod command refuses to load the livepatch module:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko pre_patch_ret=-19
+  [   82.747326] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   82.747743] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   82.747767] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   82.748237] livepatch: pre-patch callback failed for object 'vmlinux'
+  [   82.748637] livepatch: failed to enable patch 'livepatch_callbacks_demo'
+  [   82.749059] livepatch: 'livepatch_callbacks_demo': canceling transition, going to unpatch
+  [   82.749060] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   82.749868] livepatch: 'livepatch_callbacks_demo': unpatching complete
+  [   82.765809] insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-demo.ko: No such device
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   84.774238] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+
+
+Test 7
+------
+
+Similar to the previous test, setup a livepatch such that its vmlinux
+pre-patch callback returns success.  However, when a targeted kernel
+module is later loaded, have the livepatch return a failing status code:
+
+- load livepatch
+- setup -ENODEV
+- load target module
+- disable livepatch
+- unload livepatch
+
+Load the livepatch, notice vmlinux pre-patch callback succeeds:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   86.787845] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   86.788325] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   86.788427] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   86.788821] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   87.711069] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   87.711143] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   87.711886] livepatch: 'livepatch_callbacks_demo': patching complete
+
+Set a trap so subsequent pre-patch callbacks to this livepatch will
+return -ENODEV:
+
+  % echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
+
+The livepatch pre-patch callback for subsequently loaded target modules
+will return failure, so the module loader refuses to load the kernel
+module.  Notice that no post-patch or pre/post-unpatch callbacks are
+executed for this klp_object:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [   90.796976] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [   90.797834] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [   90.798900] livepatch: pre-patch callback failed for object 'livepatch_callbacks_mod'
+  [   90.799652] livepatch: patch 'livepatch_callbacks_demo' failed for module 'livepatch_callbacks_mod', refusing to load module 'livepatch_callbacks_mod'
+  [   90.819737] insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
+
+However, pre/post-unpatch callbacks run for the vmlinux klp_object:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [   92.823547] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [   92.823573] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [   92.824331] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [   93.727128] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [   93.727327] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [   93.727861] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+
+
+Test 8
+------
+
+Test loading multiple targeted kernel modules.  This test-case is
+mainly for comparing with the next test-case.
+
+- load busy target module (0s sleep),
+- load livepatch
+- load target module
+- unload target module
+- disable livepatch
+- unload livepatch
+- unload busy target module
+
+
+Load a target "busy" kernel module which kicks off a worker function
+that immediately exits:
+
+  % insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=0
+  [   96.910107] livepatch_callbacks_busymod: livepatch_callbacks_mod_init
+  [   96.910600] livepatch_callbacks_busymod: busymod_work_func, sleeping 0 seconds ...
+  [   96.913024] livepatch_callbacks_busymod: busymod_work_func exit
+
+Proceed with loading the livepatch and another ordinary target module,
+notice that the post-patch callbacks are executed and the transition
+completes quickly:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [   98.917892] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [   98.918426] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [   98.918453] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [   98.918955] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [   98.923835] livepatch: 'livepatch_callbacks_demo': starting patching transition
+  [   99.743104] livepatch: 'livepatch_callbacks_demo': completing patching transition
+  [   99.743156] livepatch_callbacks_demo: post_patch_callback: vmlinux
+  [   99.743679] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [   99.744616] livepatch: 'livepatch_callbacks_demo': patching complete
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [  100.930955] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [  100.931668] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [  100.932645] livepatch_callbacks_demo: post_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [  100.934125] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [  102.942805] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+  [  102.943640] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+  [  102.944585] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod'
+  [  102.945455] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [  104.953815] livepatch: 'livepatch_callbacks_demo': initializing unpatching transition
+  [  104.953838] livepatch_callbacks_demo: pre_unpatch_callback: vmlinux
+  [  104.954431] livepatch_callbacks_demo: pre_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [  104.955426] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [  106.719073] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [  106.722633] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [  106.723282] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [  106.724279] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+  % rmmod samples/livepatch/livepatch-callbacks-busymod.ko
+  [  108.975660] livepatch_callbacks_busymod: livepatch_callbacks_mod_exit
+
+
+Test 9
+------
+
+A similar test as the previous one, but force the "busy" kernel module
+to do longer work.
+
+The livepatching core will refuse to patch a task that is currently
+executing a to-be-patched function -- the consistency model stalls the
+current patch transition until this safety-check is met.  Test a
+scenario where one of a livepatch's target klp_objects sits on such a
+function for a long time.  Meanwhile, load and unload other target
+kernel modules while the livepatch transition is in progress.
+
+- load busy target module (30s sleep)
+- load livepatch
+- load target module
+- unload target module
+- disable livepatch
+- unload livepatch
+- unload busy target module
+
+
+Load the "busy" kernel module, this time make it do 30 seconds worth of
+work:
+
+  % insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
+  [  110.993362] livepatch_callbacks_busymod: livepatch_callbacks_mod_init
+  [  110.994059] livepatch_callbacks_busymod: busymod_work_func, sleeping 30 seconds ...
+
+Meanwhile, the livepatch is loaded.  Notice that the patch transition
+does not complete as the targeted "busy" module is sitting on a
+to-be-patched function:
+
+  % insmod samples/livepatch/livepatch-callbacks-demo.ko
+  [  113.000309] livepatch: enabling patch 'livepatch_callbacks_demo'
+  [  113.000764] livepatch: 'livepatch_callbacks_demo': initializing patching transition
+  [  113.000791] livepatch_callbacks_demo: pre_patch_callback: vmlinux
+  [  113.001289] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [  113.005208] livepatch: 'livepatch_callbacks_demo': starting patching transition
+
+Load a second target module (this one is an ordinary idle kernel
+module).  Note that *no* post-patch callbacks will be executed while the
+livepatch is still in transition:
+
+  % insmod samples/livepatch/livepatch-callbacks-mod.ko
+  [  115.012740] livepatch: applying patch 'livepatch_callbacks_demo' to loading module 'livepatch_callbacks_mod'
+  [  115.013406] livepatch_callbacks_demo: pre_patch_callback: livepatch_callbacks_mod -> [MODULE_STATE_COMING] Full formed, running module_init
+  [  115.015315] livepatch_callbacks_mod: livepatch_callbacks_mod_init
+
+Request an unload of the simple kernel module.  The patch is still
+transitioning, so its pre-unpatch callbacks are skipped:
+
+  % rmmod samples/livepatch/livepatch-callbacks-mod.ko
+  [  117.022626] livepatch_callbacks_mod: livepatch_callbacks_mod_exit
+  [  117.023376] livepatch: reverting patch 'livepatch_callbacks_demo' on unloading module 'livepatch_callbacks_mod'
+  [  117.024533] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_mod -> [MODULE_STATE_GOING] Going away
+
+Finally the livepatch is disabled.  Since none of the patch's
+klp_object's post-patch callbacks executed, the remaining klp_object's
+pre-unpatch callbacks are skipped:
+
+  % echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+  [  119.035408] livepatch: 'livepatch_callbacks_demo': reversing transition from patching to unpatching
+  [  119.035485] livepatch: 'livepatch_callbacks_demo': starting unpatching transition
+  [  119.711166] livepatch: 'livepatch_callbacks_demo': completing unpatching transition
+  [  119.714179] livepatch_callbacks_demo: post_unpatch_callback: vmlinux
+  [  119.714653] livepatch_callbacks_demo: post_unpatch_callback: livepatch_callbacks_busymod -> [MODULE_STATE_LIVE] Normal state
+  [  119.715437] livepatch: 'livepatch_callbacks_demo': unpatching complete
+
+  % rmmod samples/livepatch/livepatch-callbacks-demo.ko
+  % rmmod samples/livepatch/livepatch-callbacks-busymod.ko
+  [  141.279111] livepatch_callbacks_busymod: busymod_work_func exit
+  [  141.279760] livepatch_callbacks_busymod: livepatch_callbacks_mod_exit
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index d08eddc00497..fc5c1be3f6f4 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -87,10 +87,35 @@ struct klp_func {
 	bool transition;
 };
 
+struct klp_object;
+
+/**
+ * struct klp_callbacks - pre/post live-(un)patch callback structure
+ * @pre_patch:		executed before code patching
+ * @post_patch:		executed after code patching
+ * @pre_unpatch:	executed before code unpatching
+ * @post_unpatch:	executed after code unpatching
+ * @post_unpatch_enabled:	flag indicating if post-unpatch callback
+ * 				should run
+ *
+ * All callbacks are optional.  Only the pre-patch callback, if provided,
+ * will be unconditionally executed.  If the parent klp_object fails to
+ * patch for any reason, including a non-zero error status returned from
+ * the pre-patch callback, no further callbacks will be executed.
+ */
+struct klp_callbacks {
+	int (*pre_patch)(struct klp_object *obj);
+	void (*post_patch)(struct klp_object *obj);
+	void (*pre_unpatch)(struct klp_object *obj);
+	void (*post_unpatch)(struct klp_object *obj);
+	bool post_unpatch_enabled;
+};
+
 /**
  * struct klp_object - kernel object structure for live patching
  * @name:	module name (or NULL for vmlinux)
  * @funcs:	function entries for functions to be patched in the object
+ * @callbacks:	functions to be executed pre/post (un)patching
  * @kobj:	kobject for sysfs resources
  * @mod:	kernel module associated with the patched object
  *		(NULL for vmlinux)
@@ -100,6 +125,7 @@ struct klp_object {
 	/* external */
 	const char *name;
 	struct klp_func *funcs;
+	struct klp_callbacks callbacks;
 
 	/* internal */
 	struct kobject kobj;
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index b9628e43c78f..cafb5a84417d 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj)
 	return obj->name;
 }
 
-static bool klp_is_object_loaded(struct klp_object *obj)
-{
-	return !obj->name || obj->mod;
-}
-
 /* sets obj->mod if object is not vmlinux and module is found */
 static void klp_find_object_module(struct klp_object *obj)
 {
@@ -285,6 +280,8 @@ static int klp_write_object_relocations(struct module *pmod,
 
 static int __klp_disable_patch(struct klp_patch *patch)
 {
+	struct klp_object *obj;
+
 	if (klp_transition_patch)
 		return -EBUSY;
 
@@ -295,6 +292,10 @@ static int __klp_disable_patch(struct klp_patch *patch)
 
 	klp_init_transition(patch, KLP_UNPATCHED);
 
+	klp_for_each_object(patch, obj)
+		if (patch->enabled && obj->patched)
+			klp_pre_unpatch_callback(obj);
+
 	/*
 	 * Enforce the order of the func->transition writes in
 	 * klp_init_transition() and the TIF_PATCH_PENDING writes in
@@ -388,13 +389,18 @@ static int __klp_enable_patch(struct klp_patch *patch)
 		if (!klp_is_object_loaded(obj))
 			continue;
 
-		ret = klp_patch_object(obj);
+		ret = klp_pre_patch_callback(obj);
 		if (ret) {
-			pr_warn("failed to enable patch '%s'\n",
-				patch->mod->name);
+			pr_warn("pre-patch callback failed for object '%s'\n",
+				klp_is_module(obj) ? obj->name : "vmlinux");
+			goto err;
+		}
 
-			klp_cancel_transition();
-			return ret;
+		ret = klp_patch_object(obj);
+		if (ret) {
+			pr_warn("failed to patch object '%s'\n",
+				klp_is_module(obj) ? obj->name : "vmlinux");
+			goto err;
 		}
 	}
 
@@ -403,6 +409,11 @@ static int __klp_enable_patch(struct klp_patch *patch)
 	patch->enabled = true;
 
 	return 0;
+err:
+	pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+	klp_cancel_transition();
+	return ret;
 }
 
 /**
@@ -871,13 +882,27 @@ int klp_module_coming(struct module *mod)
 			pr_notice("applying patch '%s' to loading module '%s'\n",
 				  patch->mod->name, obj->mod->name);
 
+			ret = klp_pre_patch_callback(obj);
+			if (ret) {
+				pr_warn("pre-patch callback failed for object '%s'\n",
+					obj->name);
+				goto err;
+			}
+
 			ret = klp_patch_object(obj);
 			if (ret) {
 				pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
 					patch->mod->name, obj->mod->name, ret);
+
+				if (patch != klp_transition_patch)
+					klp_post_unpatch_callback(obj);
+
 				goto err;
 			}
 
+			if (patch != klp_transition_patch)
+				klp_post_patch_callback(obj);
+
 			break;
 		}
 	}
@@ -927,9 +952,15 @@ void klp_module_going(struct module *mod)
 			 * is in transition.
 			 */
 			if (patch->enabled || patch == klp_transition_patch) {
+
+				if (patch != klp_transition_patch)
+					klp_pre_unpatch_callback(obj);
+
 				pr_notice("reverting patch '%s' on unloading module '%s'\n",
 					  patch->mod->name, obj->mod->name);
 				klp_unpatch_object(obj);
+
+				klp_post_unpatch_callback(obj);
 			}
 
 			klp_free_object_loaded(obj);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index c74f24c47837..6fc907b54e71 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -1,6 +1,44 @@
 #ifndef _LIVEPATCH_CORE_H
 #define _LIVEPATCH_CORE_H
 
+#include <linux/livepatch.h>
+
 extern struct mutex klp_mutex;
 
+static inline bool klp_is_object_loaded(struct klp_object *obj)
+{
+	return !obj->name || obj->mod;
+}
+
+static inline int klp_pre_patch_callback(struct klp_object *obj)
+{
+	int ret;
+
+	ret = (obj->callbacks.pre_patch) ?
+		(*obj->callbacks.pre_patch)(obj) : 0;
+
+	obj->callbacks.post_unpatch_enabled = !ret;
+
+	return ret;
+}
+
+static inline void klp_post_patch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.post_patch)
+		(*obj->callbacks.post_patch)(obj);
+}
+
+static inline void klp_pre_unpatch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.pre_unpatch)
+		(*obj->callbacks.pre_unpatch)(obj);
+}
+
+static inline void klp_post_unpatch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.post_unpatch_enabled &&
+	    obj->callbacks.post_unpatch)
+		(*obj->callbacks.post_unpatch)(obj);
+}
+
 #endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 52c4e907c14b..82d584225dc6 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/bug.h>
 #include <linux/printk.h>
+#include "core.h"
 #include "patch.h"
 #include "transition.h"
 
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index b004a1fb6032..7bf55b7f3687 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -109,9 +109,6 @@ static void klp_complete_transition(void)
 		}
 	}
 
-	if (klp_target_state == KLP_UNPATCHED && !immediate_func)
-		module_put(klp_transition_patch->mod);
-
 	/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
 	if (klp_target_state == KLP_PATCHED)
 		klp_synchronize_transition();
@@ -130,6 +127,24 @@ static void klp_complete_transition(void)
 	}
 
 done:
+	klp_for_each_object(klp_transition_patch, obj) {
+		if (!klp_is_object_loaded(obj))
+			continue;
+		if (klp_target_state == KLP_PATCHED)
+			klp_post_patch_callback(obj);
+		else if (klp_target_state == KLP_UNPATCHED)
+			klp_post_unpatch_callback(obj);
+	}
+
+	/*
+	 * See complementary comment in __klp_enable_patch() for why we
+	 * keep the module reference for immediate patches.
+	 */
+	if (!klp_transition_patch->immediate && !immediate_func &&
+	    klp_target_state == KLP_UNPATCHED) {
+		module_put(klp_transition_patch->mod);
+	}
+
 	klp_target_state = KLP_UNDEFINED;
 	klp_transition_patch = NULL;
 }
diff --git a/samples/livepatch/Makefile b/samples/livepatch/Makefile
index 539e81d433cd..2472ce39a18d 100644
--- a/samples/livepatch/Makefile
+++ b/samples/livepatch/Makefile
@@ -2,3 +2,6 @@ obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-sample.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-mod.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix1.o
 obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-shadow-fix2.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-demo.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-mod.o
+obj-$(CONFIG_SAMPLE_LIVEPATCH) += livepatch-callbacks-busymod.o
diff --git a/samples/livepatch/livepatch-callbacks-busymod.c b/samples/livepatch/livepatch-callbacks-busymod.c
new file mode 100644
index 000000000000..80d06e103f1b
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-busymod.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-busymod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone.  See the "Usage"
+ * section of livepatch-callbacks-mod.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+
+static int sleep_secs;
+module_param(sleep_secs, int, 0644);
+MODULE_PARM_DESC(sleep_secs, "sleep_secs (default=0)");
+
+static void busymod_work_func(struct work_struct *work);
+static DECLARE_DELAYED_WORK(work, busymod_work_func);
+
+static void busymod_work_func(struct work_struct *work)
+{
+	pr_info("%s, sleeping %d seconds ...\n", __func__, sleep_secs);
+	msleep(sleep_secs * 1000);
+	pr_info("%s exit\n", __func__);
+}
+
+static int livepatch_callbacks_mod_init(void)
+{
+	pr_info("%s\n", __func__);
+	schedule_delayed_work(&work,
+		msecs_to_jiffies(1000 * 0));
+	return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+	cancel_delayed_work_sync(&work);
+	pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c
new file mode 100644
index 000000000000..3d115bd68442
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-demo.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-demo.c - (un)patching callbacks livepatch demo
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Demonstration of registering livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * Step 1 - load the simple module
+ *
+ *   insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ *
+ * Step 2 - load the demonstration livepatch (with callbacks)
+ *
+ *   insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *
+ * Step 3 - cleanup
+ *
+ *   echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ *   rmmod livepatch_callbacks_demo
+ *   rmmod livepatch_callbacks_mod
+ *
+ * Watch dmesg output to see livepatch enablement, callback execution
+ * and patching operations for both vmlinux and module targets.
+ *
+ * NOTE: swap the insmod order of livepatch-callbacks-mod.ko and
+ *       livepatch-callbacks-demo.ko to observe what happens when a
+ *       target module is loaded after a livepatch with callbacks.
+ *
+ * NOTE: 'pre_patch_ret' is a module parameter that sets the pre-patch
+ *       callback return status.  Try setting up a non-zero status
+ *       such as -19 (-ENODEV):
+ *
+ *       # Load demo livepatch, vmlinux is patched
+ *       insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *       # Setup next pre-patch callback to return -ENODEV
+ *       echo -19 > /sys/module/livepatch_callbacks_demo/parameters/pre_patch_ret
+ *
+ *       # Module loader refuses to load the target module
+ *       insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *       insmod: ERROR: could not insert module samples/livepatch/livepatch-callbacks-mod.ko: No such device
+ *
+ * NOTE: There is a second target module,
+ *       livepatch-callbacks-busymod.ko, available for experimenting
+ *       with livepatch (un)patch callbacks.  This module contains
+ *       a 'sleep_secs' parameter that parks the module on one of the
+ *       functions that the livepatch demo module wants to patch.
+ *       Modifying this value and tweaking the order of module loads can
+ *       effectively demonstrate stalled patch transitions:
+ *
+ *       # Load a target module, let it park on 'busymod_work_func' for
+ *       # thirty seconds
+ *       insmod samples/livepatch/livepatch-callbacks-busymod.ko sleep_secs=30
+ *
+ *       # Meanwhile load the livepatch
+ *       insmod samples/livepatch/livepatch-callbacks-demo.ko
+ *
+ *       # ... then load and unload another target module while the
+ *       # transition is in progress
+ *       insmod samples/livepatch/livepatch-callbacks-mod.ko
+ *       rmmod samples/livepatch/livepatch-callbacks-mod.ko
+ *
+ *       # Finally cleanup
+ *       echo 0 > /sys/kernel/livepatch/livepatch_callbacks_demo/enabled
+ *       rmmod samples/livepatch/livepatch-callbacks-demo.ko
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/livepatch.h>
+
+static int pre_patch_ret;
+module_param(pre_patch_ret, int, 0644);
+MODULE_PARM_DESC(pre_patch_ret, "pre_patch_ret (default=0)");
+
+static const char *const module_state[] = {
+	[MODULE_STATE_LIVE]	= "[MODULE_STATE_LIVE] Normal state",
+	[MODULE_STATE_COMING]	= "[MODULE_STATE_COMING] Full formed, running module_init",
+	[MODULE_STATE_GOING]	= "[MODULE_STATE_GOING] Going away",
+	[MODULE_STATE_UNFORMED]	= "[MODULE_STATE_UNFORMED] Still setting it up",
+};
+
+static void callback_info(const char *callback, struct klp_object *obj)
+{
+	if (obj->mod)
+		pr_info("%s: %s -> %s\n", callback, obj->mod->name,
+			module_state[obj->mod->state]);
+	else
+		pr_info("%s: vmlinux\n", callback);
+}
+
+/* Executed on object patching (ie, patch enablement) */
+static int pre_patch_callback(struct klp_object *obj)
+{
+	callback_info(__func__, obj);
+	return pre_patch_ret;
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void post_patch_callback(struct klp_object *obj)
+{
+	callback_info(__func__, obj);
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void pre_unpatch_callback(struct klp_object *obj)
+{
+	callback_info(__func__, obj);
+}
+
+/* Executed on object unpatching (ie, patch disablement) */
+static void post_unpatch_callback(struct klp_object *obj)
+{
+	callback_info(__func__, obj);
+}
+
+static void patched_work_func(struct work_struct *work)
+{
+	pr_info("%s\n", __func__);
+}
+
+static struct klp_func no_funcs[] = {
+	{ }
+};
+
+static struct klp_func busymod_funcs[] = {
+	{
+		.old_name = "busymod_work_func",
+		.new_func = patched_work_func,
+	}, { }
+};
+
+static struct klp_object objs[] = {
+	{
+		.name = NULL,	/* vmlinux */
+		.funcs = no_funcs,
+		.callbacks = {
+			.pre_patch = pre_patch_callback,
+			.post_patch = post_patch_callback,
+			.pre_unpatch = pre_unpatch_callback,
+			.post_unpatch = post_unpatch_callback,
+		},
+	},	{
+		.name = "livepatch_callbacks_mod",
+		.funcs = no_funcs,
+		.callbacks = {
+			.pre_patch = pre_patch_callback,
+			.post_patch = post_patch_callback,
+			.pre_unpatch = pre_unpatch_callback,
+			.post_unpatch = post_unpatch_callback,
+		},
+	},	{
+		.name = "livepatch_callbacks_busymod",
+		.funcs = busymod_funcs,
+		.callbacks = {
+			.pre_patch = pre_patch_callback,
+			.post_patch = post_patch_callback,
+			.pre_unpatch = pre_unpatch_callback,
+			.post_unpatch = post_unpatch_callback,
+		},
+	}, { }
+};
+
+static struct klp_patch patch = {
+	.mod = THIS_MODULE,
+	.objs = objs,
+};
+
+static int livepatch_callbacks_demo_init(void)
+{
+	int ret;
+
+	if (!klp_have_reliable_stack() && !patch.immediate) {
+		/*
+		 * WARNING: Be very careful when using 'patch.immediate' in
+		 * your patches.  It's ok to use it for simple patches like
+		 * this, but for more complex patches which change function
+		 * semantics, locking semantics, or data structures, it may not
+		 * be safe.  Use of this option will also prevent removal of
+		 * the patch.
+		 *
+		 * See Documentation/livepatch/livepatch.txt for more details.
+		 */
+		patch.immediate = true;
+		pr_notice("The consistency model isn't supported for your architecture.  Bypassing safety mechanisms and applying the patch immediately.\n");
+	}
+
+	ret = klp_register_patch(&patch);
+	if (ret)
+		return ret;
+	ret = klp_enable_patch(&patch);
+	if (ret) {
+		WARN_ON(klp_unregister_patch(&patch));
+		return ret;
+	}
+	return 0;
+}
+
+static void livepatch_callbacks_demo_exit(void)
+{
+	WARN_ON(klp_unregister_patch(&patch));
+}
+
+module_init(livepatch_callbacks_demo_init);
+module_exit(livepatch_callbacks_demo_exit);
+MODULE_LICENSE("GPL");
+MODULE_INFO(livepatch, "Y");
diff --git a/samples/livepatch/livepatch-callbacks-mod.c b/samples/livepatch/livepatch-callbacks-mod.c
new file mode 100644
index 000000000000..e610ce29ba44
--- /dev/null
+++ b/samples/livepatch/livepatch-callbacks-mod.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * livepatch-callbacks-mod.c - (un)patching callbacks demo support module
+ *
+ *
+ * Purpose
+ * -------
+ *
+ * Simple module to demonstrate livepatch (un)patching callbacks.
+ *
+ *
+ * Usage
+ * -----
+ *
+ * This module is not intended to be standalone.  See the "Usage"
+ * section of livepatch-callbacks-demo.c.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+static int livepatch_callbacks_mod_init(void)
+{
+	pr_info("%s\n", __func__);
+	return 0;
+}
+
+static void livepatch_callbacks_mod_exit(void)
+{
+	pr_info("%s\n", __func__);
+}
+
+module_init(livepatch_callbacks_mod_init);
+module_exit(livepatch_callbacks_mod_exit);
+MODULE_LICENSE("GPL");
-- 
cgit v1.3-14-g43fede


From 6116c3033a761611b1da980ea664c6ddff3eaed6 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 13 Oct 2017 15:08:42 -0400
Subject: livepatch: move transition "complete" notice into
 klp_complete_transition()

klp_complete_transition() performs a bit of housework before a
transition to KLP_PATCHED or KLP_UNPATCHED is actually completed
(including post-(un)patch callbacks).  To be consistent, move the
transition "complete" kernel log notice out of
klp_try_complete_transition() and into klp_complete_transition().

Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Reviewed-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/transition.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 7bf55b7f3687..53887f0bca10 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -136,6 +136,9 @@ done:
 			klp_post_unpatch_callback(obj);
 	}
 
+	pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
+		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
 	/*
 	 * See complementary comment in __klp_enable_patch() for why we
 	 * keep the module reference for immediate patches.
@@ -423,9 +426,6 @@ void klp_try_complete_transition(void)
 	}
 
 success:
-	pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
-		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
-
 	/* we're done, now cleanup the data structures */
 	klp_complete_transition();
 }
-- 
cgit v1.3-14-g43fede


From af026796054fb70439e919a925615e61b500ef6b Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 13 Oct 2017 15:08:43 -0400
Subject: livepatch: add transition notices

Log a few kernel debug messages at the beginning of the following livepatch
transition functions:

  klp_complete_transition()
  klp_cancel_transition()
  klp_init_transition()
  klp_reverse_transition()

Also update the log notice message in klp_start_transition() for similar
verbiage as the above messages.

Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Acked-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/transition.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 53887f0bca10..56add6327736 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -82,6 +82,10 @@ static void klp_complete_transition(void)
 	unsigned int cpu;
 	bool immediate_func = false;
 
+	pr_debug("'%s': completing %s transition\n",
+		 klp_transition_patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
 	if (klp_target_state == KLP_UNPATCHED) {
 		/*
 		 * All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -163,6 +167,9 @@ void klp_cancel_transition(void)
 	if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
 		return;
 
+	pr_debug("'%s': canceling patching transition, going to unpatch\n",
+		 klp_transition_patch->mod->name);
+
 	klp_target_state = KLP_UNPATCHED;
 	klp_complete_transition();
 }
@@ -441,7 +448,8 @@ void klp_start_transition(void)
 
 	WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
 
-	pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
+	pr_notice("'%s': starting %s transition\n",
+		  klp_transition_patch->mod->name,
 		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
 
 	/*
@@ -497,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	 */
 	klp_target_state = state;
 
+	pr_debug("'%s': initializing %s transition\n", patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
 	/*
 	 * If the patch can be applied or reverted immediately, skip the
 	 * per-task transitions.
@@ -562,6 +573,11 @@ void klp_reverse_transition(void)
 	unsigned int cpu;
 	struct task_struct *g, *task;
 
+	pr_debug("'%s': reversing transition from %s\n",
+		 klp_transition_patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching to unpatching" :
+						   "unpatching to patching");
+
 	klp_transition_patch->enabled = !klp_transition_patch->enabled;
 
 	klp_target_state = !klp_target_state;
-- 
cgit v1.3-14-g43fede


From f1d783585486c7c612f277c2a6f0c9bb5a67e463 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 5 Oct 2017 10:44:54 +0900
Subject: irqdomain: Move revmap_trees_mutex to struct irq_domain

The revmap_trees_mutex protects domain->revmap_tree.  There is no
need to make it global because it is allowed to modify revmap_tree
of two different domains concurrently.  Having said that, this would
not be a actual bottleneck because the interrupt map/unmap does not
occur quite often.  Rather, the motivation is to tidy up the code
from a data structure point of view.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irqdomain.h |  2 ++
 kernel/irq/irqdomain.c    | 14 +++++++-------
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 7d0c6c144708..df162f7a4aad 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -32,6 +32,7 @@
 #include <linux/types.h>
 #include <linux/irqhandler.h>
 #include <linux/of.h>
+#include <linux/mutex.h>
 #include <linux/radix-tree.h>
 
 struct device_node;
@@ -176,6 +177,7 @@ struct irq_domain {
 	unsigned int revmap_direct_max_irq;
 	unsigned int revmap_size;
 	struct radix_tree_root revmap_tree;
+	struct mutex revmap_tree_mutex;
 	unsigned int linear_revmap[];
 };
 
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index b50f737574ae..31d0f9ff4f00 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -21,7 +21,6 @@
 static LIST_HEAD(irq_domain_list);
 static DEFINE_MUTEX(irq_domain_mutex);
 
-static DEFINE_MUTEX(revmap_trees_mutex);
 static struct irq_domain *irq_default_domain;
 
 static void irq_domain_check_hierarchy(struct irq_domain *domain);
@@ -211,6 +210,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
 
 	/* Fill structure */
 	INIT_RADIX_TREE(&domain->revmap_tree, GFP_KERNEL);
+	mutex_init(&domain->revmap_tree_mutex);
 	domain->ops = ops;
 	domain->host_data = host_data;
 	domain->hwirq_max = hwirq_max;
@@ -462,9 +462,9 @@ static void irq_domain_clear_mapping(struct irq_domain *domain,
 	if (hwirq < domain->revmap_size) {
 		domain->linear_revmap[hwirq] = 0;
 	} else {
-		mutex_lock(&revmap_trees_mutex);
+		mutex_lock(&domain->revmap_tree_mutex);
 		radix_tree_delete(&domain->revmap_tree, hwirq);
-		mutex_unlock(&revmap_trees_mutex);
+		mutex_unlock(&domain->revmap_tree_mutex);
 	}
 }
 
@@ -475,9 +475,9 @@ static void irq_domain_set_mapping(struct irq_domain *domain,
 	if (hwirq < domain->revmap_size) {
 		domain->linear_revmap[hwirq] = irq_data->irq;
 	} else {
-		mutex_lock(&revmap_trees_mutex);
+		mutex_lock(&domain->revmap_tree_mutex);
 		radix_tree_insert(&domain->revmap_tree, hwirq, irq_data);
-		mutex_unlock(&revmap_trees_mutex);
+		mutex_unlock(&domain->revmap_tree_mutex);
 	}
 }
 
@@ -1459,11 +1459,11 @@ static void irq_domain_fix_revmap(struct irq_data *d)
 		return; /* Not using radix tree. */
 
 	/* Fix up the revmap. */
-	mutex_lock(&revmap_trees_mutex);
+	mutex_lock(&d->domain->revmap_tree_mutex);
 	slot = radix_tree_lookup_slot(&d->domain->revmap_tree, d->hwirq);
 	if (slot)
 		radix_tree_replace_slot(&d->domain->revmap_tree, slot, d);
-	mutex_unlock(&revmap_trees_mutex);
+	mutex_unlock(&d->domain->revmap_tree_mutex);
 }
 
 /**
-- 
cgit v1.3-14-g43fede


From d03cc2d8aed3bfbdfc1f52e79054e0013b5721ca Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Fri, 22 Sep 2017 21:20:41 +0900
Subject: irqdomain: Add __rcu annotations to radix tree slot

Fix different address spaces warning of sparse.

kernel/irq/irqdomain.c:1463:14: warning: incorrect type in assignment (different address spaces)
kernel/irq/irqdomain.c:1463:14:    expected void **slot
kernel/irq/irqdomain.c:1463:14:    got void [noderef] <asn:4>**
kernel/irq/irqdomain.c:1465:66: warning: incorrect type in argument 2 (different address spaces)
kernel/irq/irqdomain.c:1465:66:    expected void [noderef] <asn:4>**slot
kernel/irq/irqdomain.c:1465:66:    got void **slot

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 kernel/irq/irqdomain.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 31d0f9ff4f00..fbbf34293b17 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
 	struct irq_desc *desc;
 	struct irq_domain *domain;
 	struct radix_tree_iter iter;
-	void **slot;
+	void __rcu **slot;
 	int i;
 
 	seq_printf(m, " %-16s  %-6s  %-10s  %-10s  %s\n",
@@ -1453,7 +1453,7 @@ out_free_desc:
 /* The irq_data was moved, fix the revmap to refer to the new location */
 static void irq_domain_fix_revmap(struct irq_data *d)
 {
-	void **slot;
+	void __rcu **slot;
 
 	if (d->hwirq < d->domain->revmap_size)
 		return; /* Not using radix tree. */
-- 
cgit v1.3-14-g43fede


From 9ad0457423af877ad1b76c105a57130da028ccad Mon Sep 17 00:00:00 2001
From: Markus Elfring <elfring@users.sourceforge.net>
Date: Fri, 6 Oct 2017 16:27:26 +0200
Subject: kernel/module: Delete an error message for a failed memory allocation
 in add_module_usage()

Omit an extra message for a memory allocation failure in this function.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Jessica Yu <jeyu@kernel.org>
---
 kernel/module.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..07ef44767245 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -837,10 +837,8 @@ static int add_module_usage(struct module *a, struct module *b)
 
 	pr_debug("Allocating new usage for %s.\n", a->name);
 	use = kmalloc(sizeof(*use), GFP_ATOMIC);
-	if (!use) {
-		pr_warn("%s: out of memory loading\n", a->name);
+	if (!use)
 		return -ENOMEM;
-	}
 
 	use->source = a;
 	use->target = b;
-- 
cgit v1.3-14-g43fede


From 3a29ddb1c5986a6d3f941bfb1f434105203ce7f6 Mon Sep 17 00:00:00 2001
From: James Hogan <jhogan@kernel.org>
Date: Thu, 19 Oct 2017 15:17:23 +0100
Subject: clockevents: Retry programming min delta up to 10 times

When CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=n, the call path
hrtimer_reprogram -> clockevents_program_event ->
clockevents_program_min_delta will not retry if the clock event driver
returns -ETIME.

If the driver could not satisfy the program_min_delta for any reason, the
lack of a retry means the CPU may not receive a tick interrupt, potentially
until the counter does a full period. This leads to rcu_sched timeout
messages as the stalled CPU is detected by other CPUs, and other issues if
the CPU is holding locks or other resources at the point at which it
stalls.

There have been a couple of observed mechanisms through which a clock event
driver could not satisfy the requested min_delta and return -ETIME.

With the MIPS GIC driver, the shared execution resource within MT cores
means inconventient latency due to execution of instructions from other
hardware threads in the core, within gic_next_event, can result in an event
being set in the past.

Additionally under virtualisation it is possible to get unexpected latency
during a clockevent device's set_next_event() callback which can make it
return -ETIME even for a delta based on min_delta_ns.

It isn't appropriate to use MIN_ADJUST in the virtualisation case as
occasional hypervisor induced high latency will cause min_delta_ns to
quickly increase to the maximum.

Instead, borrow the retry pattern from the MIN_ADJUST case, but without
making adjustments. Retry up to 10 times, each time increasing the
attempted delta by min_delta, before giving up.

[ Matt: Reworked the loop and made retry increase the delta. ]

Signed-off-by: James Hogan <jhogan@kernel.org>
Signed-off-by: Matt Redfearn <matt.redfearn@mips.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mips@linux-mips.org
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: "Martin Schwidefsky" <schwidefsky@de.ibm.com>
Cc: James Hogan <james.hogan@mips.com>
Link: https://lkml.kernel.org/r/1508422643-6075-1-git-send-email-matt.redfearn@mips.com
---
 kernel/time/clockevents.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 4237e0744e26..16c027e9cc73 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -280,17 +280,22 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
 static int clockevents_program_min_delta(struct clock_event_device *dev)
 {
 	unsigned long long clc;
-	int64_t delta;
+	int64_t delta = 0;
+	int i;
 
-	delta = dev->min_delta_ns;
-	dev->next_event = ktime_add_ns(ktime_get(), delta);
+	for (i = 0; i < 10; i++) {
+		delta += dev->min_delta_ns;
+		dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-	if (clockevent_state_shutdown(dev))
-		return 0;
+		if (clockevent_state_shutdown(dev))
+			return 0;
 
-	dev->retries++;
-	clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
-	return dev->set_next_event((unsigned long) clc, dev);
+		dev->retries++;
+		clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+		if (dev->set_next_event((unsigned long) clc, dev) == 0)
+			return 0;
+	}
+	return -ETIME;
 }
 
 #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
-- 
cgit v1.3-14-g43fede


From b88697810d7c1d102a529990f9071b0f14cfe6df Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 18 Oct 2017 08:33:44 -0700
Subject: rcu: Do not include rtmutex_common.h unconditionally

This commit adjusts include files and provides definitions in preparation
for suppressing lockdep false-positive ->boost_mtx complaints.  Without
this preparation, architectures not supporting rt_mutex will get build
failures.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index fed95fa941e6..969eae45f05d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -54,6 +54,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
  * This probably needs to be excluded from -rt builds.
  */
 #define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
 
 #endif /* #else #ifdef CONFIG_RCU_BOOST */
 
@@ -911,8 +912,6 @@ void exit_rcu(void)
 
 #ifdef CONFIG_RCU_BOOST
 
-#include "../locking/rtmutex_common.h"
-
 static void rcu_wake_cond(struct task_struct *t, int status)
 {
 	/*
-- 
cgit v1.3-14-g43fede


From 02a7c234e54052101164368ff981bd72f7acdd65 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 19 Sep 2017 15:36:42 -0700
Subject: rcu: Suppress lockdep false-positive ->boost_mtx complaints

RCU priority boosting uses rt_mutex_init_proxy_locked() to initialize an
rt_mutex structure in locked state held by some other task.  When that
other task releases it, lockdep complains (quite accurately, but a bit
uselessly) that the other task never acquired it.  This complaint can
suppress other, more helpful, lockdep complaints, and in any case it is
a false positive.

This commit therefore switches from rt_mutex_unlock() to
rt_mutex_futex_unlock(), thereby avoiding the lockdep annotations.
Of course, if lockdep ever learns about rt_mutex_init_proxy_locked(),
addtional adjustments will be required.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 969eae45f05d..1eaab96d1a3c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -531,7 +531,7 @@ void rcu_read_unlock_special(struct task_struct *t)
 
 		/* Unboost if we were boosted. */
 		if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
-			rt_mutex_unlock(&rnp->boost_mtx);
+			rt_mutex_futex_unlock(&rnp->boost_mtx);
 
 		/*
 		 * If this was the last task on the expedited lists,
-- 
cgit v1.3-14-g43fede


From c0da313e090d6e7130d0c3005245176296c24e4a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 22 Sep 2017 09:58:47 -0700
Subject: rcu: Add extended-quiescent-state testing advice

If you add or remove calls to rcu_idle_enter(), rcu_user_enter(),
rcu_irq_exit(), rcu_irq_exit_irqson(), rcu_idle_exit(), rcu_user_exit(),
rcu_irq_enter(), rcu_irq_enter_irqson(), rcu_nmi_enter(), or
rcu_nmi_exit(), you should run a full set of tests on a kernel built
with CONFIG_RCU_EQS_DEBUG=y.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b0ad62b0e7b8..d234356d7afc 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -837,6 +837,9 @@ static void rcu_eqs_enter(bool user)
  * We crowbar the ->dynticks_nesting field to zero to allow for
  * the possibility of usermode upcalls having messed up our count
  * of interrupt nesting level during the prior busy period.
+ *
+ * If you add or remove a call to rcu_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_idle_enter(void)
 {
@@ -852,6 +855,9 @@ void rcu_idle_enter(void)
  * is permitted between this call and rcu_user_exit(). This way the
  * CPU doesn't need to maintain the tick for RCU maintenance purposes
  * when the CPU runs in userspace.
+ *
+ * If you add or remove a call to rcu_user_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_user_enter(void)
 {
@@ -875,6 +881,9 @@ void rcu_user_enter(void)
  * Use things like work queues to work around this limitation.
  *
  * You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_irq_exit(void)
 {
@@ -899,6 +908,9 @@ void rcu_irq_exit(void)
 
 /*
  * Wrapper for rcu_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_irq_exit_irqson(void)
 {
@@ -971,6 +983,9 @@ static void rcu_eqs_exit(bool user)
  * allow for the possibility of usermode upcalls messing up our count
  * of interrupt nesting level during the busy period that is just
  * now starting.
+ *
+ * If you add or remove a call to rcu_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_idle_exit(void)
 {
@@ -987,6 +1002,9 @@ void rcu_idle_exit(void)
  *
  * Exit RCU idle mode while entering the kernel because it can
  * run a RCU read side critical section anytime.
+ *
+ * If you add or remove a call to rcu_user_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_user_exit(void)
 {
@@ -1012,6 +1030,9 @@ void rcu_user_exit(void)
  * Use things like work queues to work around this limitation.
  *
  * You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_irq_enter(void)
 {
@@ -1037,6 +1058,9 @@ void rcu_irq_enter(void)
 
 /*
  * Wrapper for rcu_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_irq_enter_irqson(void)
 {
@@ -1055,6 +1079,9 @@ void rcu_irq_enter_irqson(void)
  * that the CPU is active.  This implementation permits nested NMIs, as
  * long as the nesting level does not overflow an int.  (You will probably
  * run out of stack space first.)
+ *
+ * If you add or remove a call to rcu_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_nmi_enter(void)
 {
@@ -1087,6 +1114,9 @@ void rcu_nmi_enter(void)
  * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
  * to let the RCU grace-period handling know that the CPU is back to
  * being RCU-idle.
+ *
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
  */
 void rcu_nmi_exit(void)
 {
-- 
cgit v1.3-14-g43fede


From 56628a7fc84a6e3c4c84886d70ec26da3d7f2ce4 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 22 Sep 2017 17:28:06 +0200
Subject: rcu/segcblist: Include rcupdate.h

The RT build on ARM complains about non-existing ULONG_CMP_LT.
This commit therefore includes rcupdate.h into rcu_segcblist.c.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcu_segcblist.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 7649fcd2c4c7..88cba7c2956c 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -23,6 +23,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/rcupdate.h>
 
 #include "rcu_segcblist.h"
 
-- 
cgit v1.3-14-g43fede


From a30b85df7d599f626973e9cd3056fe755bd778e0 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 20 Oct 2017 08:43:39 +0900
Subject: kprobes: Use synchronize_rcu_tasks() for optprobe with
 CONFIG_PREEMPT=y

We want to wait for all potentially preempted kprobes trampoline
execution to have completed. This guarantees that any freed
trampoline memory is not in use by any task in the system anymore.
synchronize_rcu_tasks() gives such a guarantee, so use it.

Also, this guarantees to wait for all potentially preempted tasks
on the instructions which will be replaced with a jump.

Since this becomes a problem only when CONFIG_PREEMPT=y, enable
CONFIG_TASKS_RCU=y for synchronize_rcu_tasks() in that case.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Naveen N . Rao <naveen.n.rao@linux.vnet.ibm.com>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/150845661962.5443.17724352636247312231.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/Kconfig     |  2 +-
 kernel/kprobes.c | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 1aafb4efbb51..f75c8e8a229b 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -90,7 +90,7 @@ config STATIC_KEYS_SELFTEST
 config OPTPROBES
 	def_bool y
 	depends on KPROBES && HAVE_OPTPROBES
-	depends on !PREEMPT
+	select TASKS_RCU if PREEMPT
 
 config KPROBES_ON_FTRACE
 	def_bool y
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 15fba7fe57c8..a8fc1492b308 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -573,13 +573,15 @@ static void kprobe_optimizer(struct work_struct *work)
 	do_unoptimize_kprobes();
 
 	/*
-	 * Step 2: Wait for quiesence period to ensure all running interrupts
-	 * are done. Because optprobe may modify multiple instructions
-	 * there is a chance that Nth instruction is interrupted. In that
-	 * case, running interrupt can return to 2nd-Nth byte of jump
-	 * instruction. This wait is for avoiding it.
+	 * Step 2: Wait for quiesence period to ensure all potentially
+	 * preempted tasks to have normally scheduled. Because optprobe
+	 * may modify multiple instructions, there is a chance that Nth
+	 * instruction is preempted. In that case, such tasks can return
+	 * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
+	 * Note that on non-preemptive kernel, this is transparently converted
+	 * to synchronoze_sched() to wait for all interrupts to have completed.
 	 */
-	synchronize_sched();
+	synchronize_rcu_tasks();
 
 	/* Step 3: Optimize kprobes after quiesence period */
 	do_optimize_kprobes();
-- 
cgit v1.3-14-g43fede


From 590c845930457d25d27dc1fdd964a1ce18ef2d7d Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 6 Oct 2017 08:14:37 +0900
Subject: kprobes: Disable the jprobes APIs

Disable the jprobes APIs and comment out the jprobes API function
code. This is in preparation of removing all jprobes related
code (including kprobe's break_handler).

Nowadays ftrace and other tracing features are mature enough
to replace jprobes use-cases. Users can safely use ftrace and
perf probe etc. for their use cases.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Ian McDonald <ian.mcdonald@jandi.co.nz>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Link: http://lkml.kernel.org/r/150724527741.5014.15465541485637899227.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kprobes.h | 40 ++++++++++++++++++----------------------
 kernel/kprobes.c        |  2 ++
 2 files changed, 20 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index bd2684700b74..56b2e698dbad 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -391,10 +391,6 @@ int register_kprobes(struct kprobe **kps, int num);
 void unregister_kprobes(struct kprobe **kps, int num);
 int setjmp_pre_handler(struct kprobe *, struct pt_regs *);
 int longjmp_break_handler(struct kprobe *, struct pt_regs *);
-int register_jprobe(struct jprobe *p);
-void unregister_jprobe(struct jprobe *p);
-int register_jprobes(struct jprobe **jps, int num);
-void unregister_jprobes(struct jprobe **jps, int num);
 void jprobe_return(void);
 unsigned long arch_deref_entry_point(void *);
 
@@ -443,20 +439,6 @@ static inline void unregister_kprobe(struct kprobe *p)
 static inline void unregister_kprobes(struct kprobe **kps, int num)
 {
 }
-static inline int register_jprobe(struct jprobe *p)
-{
-	return -ENOSYS;
-}
-static inline int register_jprobes(struct jprobe **jps, int num)
-{
-	return -ENOSYS;
-}
-static inline void unregister_jprobe(struct jprobe *p)
-{
-}
-static inline void unregister_jprobes(struct jprobe **jps, int num)
-{
-}
 static inline void jprobe_return(void)
 {
 }
@@ -486,6 +468,20 @@ static inline int enable_kprobe(struct kprobe *kp)
 	return -ENOSYS;
 }
 #endif /* CONFIG_KPROBES */
+static inline int __deprecated register_jprobe(struct jprobe *p)
+{
+	return -ENOSYS;
+}
+static inline int __deprecated register_jprobes(struct jprobe **jps, int num)
+{
+	return -ENOSYS;
+}
+static inline void __deprecated unregister_jprobe(struct jprobe *p)
+{
+}
+static inline void __deprecated unregister_jprobes(struct jprobe **jps, int num)
+{
+}
 static inline int disable_kretprobe(struct kretprobe *rp)
 {
 	return disable_kprobe(&rp->kp);
@@ -494,13 +490,13 @@ static inline int enable_kretprobe(struct kretprobe *rp)
 {
 	return enable_kprobe(&rp->kp);
 }
-static inline int disable_jprobe(struct jprobe *jp)
+static inline int __deprecated disable_jprobe(struct jprobe *jp)
 {
-	return disable_kprobe(&jp->kp);
+	return -ENOSYS;
 }
-static inline int enable_jprobe(struct jprobe *jp)
+static inline int __deprecated enable_jprobe(struct jprobe *jp)
 {
-	return enable_kprobe(&jp->kp);
+	return -ENOSYS;
 }
 
 #ifndef CONFIG_KPROBES
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a8fc1492b308..da2ccf142358 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1771,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
 	return (unsigned long)entry;
 }
 
+#if 0
 int register_jprobes(struct jprobe **jps, int num)
 {
 	int ret = 0, i;
@@ -1839,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num)
 	}
 }
 EXPORT_SYMBOL_GPL(unregister_jprobes);
+#endif
 
 #ifdef CONFIG_KRETPROBES
 /*
-- 
cgit v1.3-14-g43fede


From 2c7d662e2647aa55fa56dc449b3878ac24e17adf Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Fri, 6 Oct 2017 08:15:17 +0900
Subject: kprobes: Disable the jprobes test code

Disable jprobes test code because jprobes are deprecated.
This code will be completely removed when the jprobe code
is removed.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Ananth N Mavinakayanahalli <ananth@linux.vnet.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Ian McDonald <ian.mcdonald@jandi.co.nz>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E . McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Link: http://lkml.kernel.org/r/150724531730.5014.6377596890962355763.stgit@devbox
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/test_kprobes.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 47106a1e645a..dd53e354f630 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,7 +22,7 @@
 
 #define div_factor 3
 
-static u32 rand1, preh_val, posth_val, jph_val;
+static u32 rand1, preh_val, posth_val;
 static int errors, handler_errors, num_tests;
 static u32 (*target)(u32 value);
 static u32 (*target2)(u32 value);
@@ -162,6 +162,9 @@ static int test_kprobes(void)
 
 }
 
+#if 0
+static u32 jph_val;
+
 static u32 j_kprobe_target(u32 value)
 {
 	if (preemptible()) {
@@ -239,6 +242,10 @@ static int test_jprobes(void)
 
 	return 0;
 }
+#else
+#define test_jprobe() (0)
+#define test_jprobes() (0)
+#endif
 #ifdef CONFIG_KRETPROBES
 static u32 krph_val;
 
-- 
cgit v1.3-14-g43fede


From 6e71b04a82248ccf13a94b85cbc674a9fefe53f5 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:22 -0700
Subject: bpf: Add file mode configuration into bpf maps

Introduce the map read/write flags to the eBPF syscalls that returns the
map fd. The flags is used to set up the file mode when construct a new
file descriptor for bpf maps. To not break the backward capability, the
f_flags is set to O_RDWR if the flag passed by syscall is 0. Otherwise
it should be O_RDONLY or O_WRONLY. When the userspace want to modify or
read the map content, it will check the file mode to see if it is
allowed to make the change.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  8 +++--
 include/uapi/linux/bpf.h |  6 ++++
 kernel/bpf/arraymap.c    |  6 +++-
 kernel/bpf/devmap.c      |  5 ++-
 kernel/bpf/hashtab.c     |  5 +--
 kernel/bpf/inode.c       | 15 ++++++---
 kernel/bpf/lpm_trie.c    |  3 +-
 kernel/bpf/sockmap.c     |  5 ++-
 kernel/bpf/stackmap.c    |  5 ++-
 kernel/bpf/syscall.c     | 88 ++++++++++++++++++++++++++++++++++++++++++------
 net/netfilter/xt_bpf.c   |  2 +-
 11 files changed, 122 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index d67ccdc0099f..3e5508f2fa87 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -315,11 +315,11 @@ void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-int bpf_map_new_fd(struct bpf_map *map);
+int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
 
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
-int bpf_obj_get_user(const char __user *pathname);
+int bpf_obj_get_user(const char __user *pathname, int flags);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
@@ -338,6 +338,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
 				void *key, void *value, u64 map_flags);
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
+int bpf_get_file_flag(int flags);
+
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
  * Best-effort only.  No barriers here, since it _will_ race with concurrent
@@ -421,7 +423,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
-static inline int bpf_obj_get_user(const char __user *pathname)
+static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	return -EOPNOTSUPP;
 }
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4303fb6c3817..d83f95ea6a1b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -218,6 +218,10 @@ enum bpf_attach_type {
 
 #define BPF_OBJ_NAME_LEN 16U
 
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY		(1U << 3)
+#define BPF_F_WRONLY		(1U << 4)
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
@@ -260,6 +264,7 @@ union bpf_attr {
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
 		__aligned_u64	pathname;
 		__u32		bpf_fd;
+		__u32		file_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
@@ -287,6 +292,7 @@ union bpf_attr {
 			__u32		map_id;
 		};
 		__u32		next_id;
+		__u32		open_flags;
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 68d866628be0..988c04c91e10 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
 
 #include "map_in_map.h"
 
+#define ARRAY_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
 	int i;
@@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+	    attr->value_size == 0 ||
+	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 	    (percpu && numa_node != NUMA_NO_NODE))
 		return ERR_PTR(-EINVAL);
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e093d9a2c4dd..e5d3de7cff2e 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 
+#define DEV_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_dtab_netdev {
 	struct net_device *dev;
 	struct bpf_dtab *dtab;
@@ -80,7 +83,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 431126f31ea3..919955236e63 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
 
-#define HTAB_CREATE_FLAG_MASK \
-	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK						\
+	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
+	 BPF_F_RDONLY | BPF_F_WRONLY)
 
 struct bucket {
 	struct hlist_nulls_head head;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index be1dde967208..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
 }
 
 static void *bpf_obj_do_get(const struct filename *pathname,
-			    enum bpf_type *type)
+			    enum bpf_type *type, int flags)
 {
 	struct inode *inode;
 	struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
 		return ERR_PTR(ret);
 
 	inode = d_backing_inode(path.dentry);
-	ret = inode_permission(inode, MAY_WRITE);
+	ret = inode_permission(inode, ACC_MODE(flags));
 	if (ret)
 		goto out;
 
@@ -326,18 +326,23 @@ out:
 	return ERR_PTR(ret);
 }
 
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	enum bpf_type type = BPF_TYPE_UNSPEC;
 	struct filename *pname;
 	int ret = -ENOENT;
+	int f_flags;
 	void *raw;
 
+	f_flags = bpf_get_file_flag(flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	pname = getname(pathname);
 	if (IS_ERR(pname))
 		return PTR_ERR(pname);
 
-	raw = bpf_obj_do_get(pname, &type);
+	raw = bpf_obj_do_get(pname, &type, f_flags);
 	if (IS_ERR(raw)) {
 		ret = PTR_ERR(raw);
 		goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
 	if (type == BPF_TYPE_PROG)
 		ret = bpf_prog_new_fd(raw);
 	else if (type == BPF_TYPE_MAP)
-		ret = bpf_map_new_fd(raw);
+		ret = bpf_map_new_fd(raw, f_flags);
 	else
 		goto out;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 34d8a690ea05..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -495,7 +495,8 @@ out:
 #define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
 
-#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\
+				 BPF_F_RDONLY | BPF_F_WRONLY)
 
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index a298d6666698..86ec846f2d5e 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -40,6 +40,9 @@
 #include <linux/list.h>
 #include <net/strparser.h>
 
+#define SOCK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_stab {
 	struct bpf_map map;
 	struct sock **sock_map;
@@ -489,7 +492,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
 #include <linux/perf_event.h>
 #include "percpu_freelist.h"
 
+#define STACK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags & ~BPF_F_NUMA_NODE)
+	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0e893cac6795..676a06e6b322 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -34,6 +34,8 @@
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
 
+#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -294,17 +296,48 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+			      loff_t *ppos)
+{
+	/* We need this handler such that alloc_file() enables
+	 * f_mode with FMODE_CAN_READ.
+	 */
+	return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+			       size_t siz, loff_t *ppos)
+{
+	/* We need this handler such that alloc_file() enables
+	 * f_mode with FMODE_CAN_WRITE.
+	 */
+	return -EINVAL;
+}
+
 static const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_map_show_fdinfo,
 #endif
 	.release	= bpf_map_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
 };
 
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
 {
 	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
-				O_RDWR | O_CLOEXEC);
+				flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+		return -EINVAL;
+	if (flags & BPF_F_RDONLY)
+		return O_RDONLY;
+	if (flags & BPF_F_WRONLY)
+		return O_WRONLY;
+	return O_RDWR;
 }
 
 /* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -344,12 +377,17 @@ static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_map *map;
+	int f_flags;
 	int err;
 
 	err = CHECK_ATTR(BPF_MAP_CREATE);
 	if (err)
 		return -EINVAL;
 
+	f_flags = bpf_get_file_flag(attr->map_flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	if (numa_node != NUMA_NO_NODE &&
 	    ((unsigned int)numa_node >= nr_node_ids ||
 	     !node_online(numa_node)))
@@ -375,7 +413,7 @@ static int map_create(union bpf_attr *attr)
 	if (err)
 		goto free_map;
 
-	err = bpf_map_new_fd(map);
+	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.
 		 * bpf_map_put() is needed because the above
@@ -490,6 +528,11 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -570,6 +613,11 @@ static int map_update_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -659,6 +707,11 @@ static int map_delete_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -702,6 +755,11 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	if (ukey) {
 		key = memdup_user(ukey, map->key_size);
 		if (IS_ERR(key)) {
@@ -908,6 +966,8 @@ static const struct file_operations bpf_prog_fops = {
 	.show_fdinfo	= bpf_prog_show_fdinfo,
 #endif
 	.release	= bpf_prog_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
 };
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
@@ -1117,11 +1177,11 @@ free_prog_nouncharge:
 	return err;
 }
 
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
 
 static int bpf_obj_pin(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ))
+	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
 		return -EINVAL;
 
 	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1129,10 +1189,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
 
 static int bpf_obj_get(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+	    attr->file_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
-	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+				attr->file_flags);
 }
 
 #ifdef CONFIG_CGROUP_BPF
@@ -1392,20 +1454,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
 
 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 {
 	struct bpf_map *map;
 	u32 id = attr->map_id;
+	int f_flags;
 	int fd;
 
-	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	f_flags = bpf_get_file_flag(attr->open_flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	spin_lock_bh(&map_idr_lock);
 	map = idr_find(&map_idr, id);
 	if (map)
@@ -1417,7 +1485,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	fd = bpf_map_new_fd(map);
+	fd = bpf_map_new_fd(map, f_flags);
 	if (fd < 0)
 		bpf_map_put(map);
 
diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c
index 29123934887b..041da0d9c06f 100644
--- a/net/netfilter/xt_bpf.c
+++ b/net/netfilter/xt_bpf.c
@@ -56,7 +56,7 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
 	int retval, fd;
 
 	set_fs(KERNEL_DS);
-	fd = bpf_obj_get_user(path);
+	fd = bpf_obj_get_user(path, 0);
 	set_fs(oldfs);
 	if (fd < 0)
 		return fd;
-- 
cgit v1.3-14-g43fede


From afdb09c720b62b8090584c11151d856df330e57d Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:24 -0700
Subject: security: bpf: Add LSM hooks for bpf object related syscall

Introduce several LSM hooks for the syscalls that will allow the
userspace to access to eBPF object such as eBPF programs and eBPF maps.
The security check is aimed to enforce a per object security protection
for eBPF object so only processes with the right priviliges can
read/write to a specific map or use a specific eBPF program. Besides
that, a general security hook is added before the multiplexer of bpf
syscall to check the cmd and the attribute used for the command. The
actual security module can decide which command need to be checked and
how the cmd should be checked.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h       |  6 ++++++
 include/linux/lsm_hooks.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/security.h  | 45 +++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c      | 34 +++++++++++++++++++++++++++--
 security/security.c       | 32 ++++++++++++++++++++++++++++
 5 files changed, 169 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 3e5508f2fa87..84c192da3e0b 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -57,6 +57,9 @@ struct bpf_map {
 	atomic_t usercnt;
 	struct bpf_map *inner_map_meta;
 	char name[BPF_OBJ_NAME_LEN];
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 };
 
 /* function argument constraints */
@@ -193,6 +196,9 @@ struct bpf_prog_aux {
 	struct user_struct *user;
 	u64 load_time; /* ns since boottime */
 	char name[BPF_OBJ_NAME_LEN];
+#ifdef CONFIG_SECURITY
+	void *security;
+#endif
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h
index c9258124e417..7161d8e7ee79 100644
--- a/include/linux/lsm_hooks.h
+++ b/include/linux/lsm_hooks.h
@@ -1351,6 +1351,40 @@
  *	@inode we wish to get the security context of.
  *	@ctx is a pointer in which to place the allocated security context.
  *	@ctxlen points to the place to put the length of @ctx.
+ *
+ * Security hooks for using the eBPF maps and programs functionalities through
+ * eBPF syscalls.
+ *
+ * @bpf:
+ *	Do a initial check for all bpf syscalls after the attribute is copied
+ *	into the kernel. The actual security module can implement their own
+ *	rules to check the specific cmd they need.
+ *
+ * @bpf_map:
+ *	Do a check when the kernel generate and return a file descriptor for
+ *	eBPF maps.
+ *
+ *	@map: bpf map that we want to access
+ *	@mask: the access flags
+ *
+ * @bpf_prog:
+ *	Do a check when the kernel generate and return a file descriptor for
+ *	eBPF programs.
+ *
+ *	@prog: bpf prog that userspace want to use.
+ *
+ * @bpf_map_alloc_security:
+ *	Initialize the security field inside bpf map.
+ *
+ * @bpf_map_free_security:
+ *	Clean up the security information stored inside bpf map.
+ *
+ * @bpf_prog_alloc_security:
+ *	Initialize the security field inside bpf program.
+ *
+ * @bpf_prog_free_security:
+ *	Clean up the security information stored inside bpf prog.
+ *
  */
 union security_list_options {
 	int (*binder_set_context_mgr)(struct task_struct *mgr);
@@ -1682,6 +1716,17 @@ union security_list_options {
 				struct audit_context *actx);
 	void (*audit_rule_free)(void *lsmrule);
 #endif /* CONFIG_AUDIT */
+
+#ifdef CONFIG_BPF_SYSCALL
+	int (*bpf)(int cmd, union bpf_attr *attr,
+				 unsigned int size);
+	int (*bpf_map)(struct bpf_map *map, fmode_t fmode);
+	int (*bpf_prog)(struct bpf_prog *prog);
+	int (*bpf_map_alloc_security)(struct bpf_map *map);
+	void (*bpf_map_free_security)(struct bpf_map *map);
+	int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux);
+	void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
+#endif /* CONFIG_BPF_SYSCALL */
 };
 
 struct security_hook_heads {
@@ -1901,6 +1946,15 @@ struct security_hook_heads {
 	struct list_head audit_rule_match;
 	struct list_head audit_rule_free;
 #endif /* CONFIG_AUDIT */
+#ifdef CONFIG_BPF_SYSCALL
+	struct list_head bpf;
+	struct list_head bpf_map;
+	struct list_head bpf_prog;
+	struct list_head bpf_map_alloc_security;
+	struct list_head bpf_map_free_security;
+	struct list_head bpf_prog_alloc_security;
+	struct list_head bpf_prog_free_security;
+#endif /* CONFIG_BPF_SYSCALL */
 } __randomize_layout;
 
 /*
diff --git a/include/linux/security.h b/include/linux/security.h
index ce6265960d6c..18800b0911e5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -31,6 +31,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/bpf.h>
 
 struct linux_binprm;
 struct cred;
@@ -1730,6 +1731,50 @@ static inline void securityfs_remove(struct dentry *dentry)
 
 #endif
 
+#ifdef CONFIG_BPF_SYSCALL
+#ifdef CONFIG_SECURITY
+extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
+extern int security_bpf_prog(struct bpf_prog *prog);
+extern int security_bpf_map_alloc(struct bpf_map *map);
+extern void security_bpf_map_free(struct bpf_map *map);
+extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
+extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+#else
+static inline int security_bpf(int cmd, union bpf_attr *attr,
+					     unsigned int size)
+{
+	return 0;
+}
+
+static inline int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+	return 0;
+}
+
+static inline int security_bpf_prog(struct bpf_prog *prog)
+{
+	return 0;
+}
+
+static inline int security_bpf_map_alloc(struct bpf_map *map)
+{
+	return 0;
+}
+
+static inline void security_bpf_map_free(struct bpf_map *map)
+{ }
+
+static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+	return 0;
+}
+
+static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{ }
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_BPF_SYSCALL */
+
 #ifdef CONFIG_SECURITY
 
 static inline char *alloc_secdata(void)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 676a06e6b322..5cb56d06b48d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -212,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
 	bpf_map_uncharge_memlock(map);
+	security_bpf_map_free(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -325,6 +326,12 @@ static const struct file_operations bpf_map_fops = {
 
 int bpf_map_new_fd(struct bpf_map *map, int flags)
 {
+	int ret;
+
+	ret = security_bpf_map(map, OPEN_FMODE(flags));
+	if (ret < 0)
+		return ret;
+
 	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
 				flags | O_CLOEXEC);
 }
@@ -405,10 +412,14 @@ static int map_create(union bpf_attr *attr)
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
-	err = bpf_map_charge_memlock(map);
+	err = security_bpf_map_alloc(map);
 	if (err)
 		goto free_map_nouncharge;
 
+	err = bpf_map_charge_memlock(map);
+	if (err)
+		goto free_map_sec;
+
 	err = bpf_map_alloc_id(map);
 	if (err)
 		goto free_map;
@@ -430,6 +441,8 @@ static int map_create(union bpf_attr *attr)
 
 free_map:
 	bpf_map_uncharge_memlock(map);
+free_map_sec:
+	security_bpf_map_free(map);
 free_map_nouncharge:
 	map->ops->map_free(map);
 	return err;
@@ -914,6 +927,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 	free_used_maps(aux);
 	bpf_prog_uncharge_memlock(aux->prog);
+	security_bpf_prog_free(aux);
 	bpf_prog_free(aux->prog);
 }
 
@@ -972,6 +986,12 @@ static const struct file_operations bpf_prog_fops = {
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
 {
+	int ret;
+
+	ret = security_bpf_prog(prog);
+	if (ret < 0)
+		return ret;
+
 	return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
 				O_RDWR | O_CLOEXEC);
 }
@@ -1111,10 +1131,14 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (!prog)
 		return -ENOMEM;
 
-	err = bpf_prog_charge_memlock(prog);
+	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
 		goto free_prog_nouncharge;
 
+	err = bpf_prog_charge_memlock(prog);
+	if (err)
+		goto free_prog_sec;
+
 	prog->len = attr->insn_cnt;
 
 	err = -EFAULT;
@@ -1172,6 +1196,8 @@ free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
 	bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+	security_bpf_prog_free(prog->aux);
 free_prog_nouncharge:
 	bpf_prog_free(prog);
 	return err;
@@ -1640,6 +1666,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	if (copy_from_user(&attr, uattr, size) != 0)
 		return -EFAULT;
 
+	err = security_bpf(cmd, &attr, size);
+	if (err < 0)
+		return err;
+
 	switch (cmd) {
 	case BPF_MAP_CREATE:
 		err = map_create(&attr);
diff --git a/security/security.c b/security/security.c
index 4bf0f571b4ef..1cd8526cb0b7 100644
--- a/security/security.c
+++ b/security/security.c
@@ -12,6 +12,7 @@
  *	(at your option) any later version.
  */
 
+#include <linux/bpf.h>
 #include <linux/capability.h>
 #include <linux/dcache.h>
 #include <linux/module.h>
@@ -1703,3 +1704,34 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule,
 				actx);
 }
 #endif /* CONFIG_AUDIT */
+
+#ifdef CONFIG_BPF_SYSCALL
+int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+	return call_int_hook(bpf, 0, cmd, attr, size);
+}
+int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+	return call_int_hook(bpf_map, 0, map, fmode);
+}
+int security_bpf_prog(struct bpf_prog *prog)
+{
+	return call_int_hook(bpf_prog, 0, prog);
+}
+int security_bpf_map_alloc(struct bpf_map *map)
+{
+	return call_int_hook(bpf_map_alloc_security, 0, map);
+}
+int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+	return call_int_hook(bpf_prog_alloc_security, 0, aux);
+}
+void security_bpf_map_free(struct bpf_map *map)
+{
+	call_void_hook(bpf_map_free_security, map);
+}
+void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{
+	call_void_hook(bpf_prog_free_security, aux);
+}
+#endif /* CONFIG_BPF_SYSCALL */
-- 
cgit v1.3-14-g43fede


From f66e448cfda021b0bcd884f26709796fe19c7cc1 Mon Sep 17 00:00:00 2001
From: Chenbo Feng <fengc@google.com>
Date: Wed, 18 Oct 2017 13:00:26 -0700
Subject: selinux: bpf: Add addtional check for bpf object file receive

Introduce a bpf object related check when sending and receiving files
through unix domain socket as well as binder. It checks if the receiving
process have privilege to read/write the bpf map or use the bpf program.
This check is necessary because the bpf maps and programs are using a
anonymous inode as their shared inode so the normal way of checking the
files and sockets when passing between processes cannot work properly on
eBPF object. This check only works when the BPF_SYSCALL is configured.

Signed-off-by: Chenbo Feng <fengc@google.com>
Acked-by: Stephen Smalley <sds@tycho.nsa.gov>
Reviewed-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  3 +++
 kernel/bpf/syscall.c     |  4 ++--
 security/selinux/hooks.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 84c192da3e0b..1e334b248ff6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -288,6 +288,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
+extern const struct file_operations bpf_map_fops;
+extern const struct file_operations bpf_prog_fops;
+
 #define BPF_PROG_TYPE(_id, _name) \
 	extern const struct bpf_prog_ops _name ## _prog_ops; \
 	extern const struct bpf_verifier_ops _name ## _verifier_ops;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5cb56d06b48d..323be2473c4b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -315,7 +315,7 @@ static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
 	return -EINVAL;
 }
 
-static const struct file_operations bpf_map_fops = {
+const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_map_show_fdinfo,
 #endif
@@ -975,7 +975,7 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_prog_show_fdinfo,
 #endif
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 12cf7de8cbed..2e3a627fc0b1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1815,6 +1815,10 @@ static inline int file_path_has_perm(const struct cred *cred,
 	return inode_has_perm(cred, file_inode(file), av, &ad);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_fd_pass(struct file *file, u32 sid);
+#endif
+
 /* Check whether a task can use an open file descriptor to
    access an inode in a given way.  Check access to the
    descriptor itself, and then use dentry_has_perm to
@@ -1845,6 +1849,12 @@ static int file_has_perm(const struct cred *cred,
 			goto out;
 	}
 
+#ifdef CONFIG_BPF_SYSCALL
+	rc = bpf_fd_pass(file, cred_sid(cred));
+	if (rc)
+		return rc;
+#endif
+
 	/* av is zero if only checking access to the descriptor. */
 	rc = 0;
 	if (av)
@@ -2165,6 +2175,12 @@ static int selinux_binder_transfer_file(struct task_struct *from,
 			return rc;
 	}
 
+#ifdef CONFIG_BPF_SYSCALL
+	rc = bpf_fd_pass(file, sid);
+	if (rc)
+		return rc;
+#endif
+
 	if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
 		return 0;
 
@@ -6288,6 +6304,39 @@ static u32 bpf_map_fmode_to_av(fmode_t fmode)
 	return av;
 }
 
+/* This function will check the file pass through unix socket or binder to see
+ * if it is a bpf related object. And apply correspinding checks on the bpf
+ * object based on the type. The bpf maps and programs, not like other files and
+ * socket, are using a shared anonymous inode inside the kernel as their inode.
+ * So checking that inode cannot identify if the process have privilege to
+ * access the bpf object and that's why we have to add this additional check in
+ * selinux_file_receive and selinux_binder_transfer_files.
+ */
+static int bpf_fd_pass(struct file *file, u32 sid)
+{
+	struct bpf_security_struct *bpfsec;
+	struct bpf_prog *prog;
+	struct bpf_map *map;
+	int ret;
+
+	if (file->f_op == &bpf_map_fops) {
+		map = file->private_data;
+		bpfsec = map->security;
+		ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+				   bpf_map_fmode_to_av(file->f_mode), NULL);
+		if (ret)
+			return ret;
+	} else if (file->f_op == &bpf_prog_fops) {
+		prog = file->private_data;
+		bpfsec = prog->aux->security;
+		ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+				   BPF__PROG_RUN, NULL);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
 static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
 {
 	u32 sid = current_sid();
-- 
cgit v1.3-14-g43fede


From e4d0b679a8467b6d0c169642f0b5f57d8d6eacc2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 17 Sep 2017 14:30:06 -0700
Subject: srcu: Add parameters to SRCU docbook comments

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/srcu.h  | 1 +
 kernel/rcu/srcutree.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 39af9bc0f653..62be8966e837 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -78,6 +78,7 @@ void synchronize_srcu(struct srcu_struct *sp);
 
 /**
  * srcu_read_lock_held - might we be in SRCU read-side critical section?
+ * @sp: The srcu_struct structure to check
  *
  * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an SRCU
  * read-side critical section.  In absence of CONFIG_DEBUG_LOCK_ALLOC,
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 729a8706751d..6d5880089ff6 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -854,7 +854,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
  * @sp: srcu_struct in queue the callback
- * @head: structure to be used for queueing the SRCU callback.
+ * @rhp: structure to be used for queueing the SRCU callback.
  * @func: function to be invoked after the SRCU grace period
  *
  * The callback function will be invoked some time after a full SRCU
-- 
cgit v1.3-14-g43fede


From b5149873a0c299195b5346fe4dc2c5b04ae2f995 Mon Sep 17 00:00:00 2001
From: Tal Shorer <tal.shorer@gmail.com>
Date: Sat, 21 Oct 2017 19:29:24 +0300
Subject: workqueue: respect isolated cpus when queueing an unbound work

Initialize wq_unbound_cpumask to exclude cpus that were isolated by
the cmdline's isolcpus parameter.

Signed-off-by: Tal Shorer <tal.shorer@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..bfa433b38a61 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4980,6 +4980,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 	if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
 		return -ENOMEM;
 
+	/*
+	 * Not excluding isolated cpus on purpose.
+	 * If the user wishes to include them, we allow that.
+	 */
 	cpumask_and(cpumask, cpumask, cpu_possible_mask);
 	if (!cpumask_empty(cpumask)) {
 		apply_wqattrs_lock();
@@ -5579,7 +5583,7 @@ int __init workqueue_init_early(void)
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+	cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.3-14-g43fede


From 31749468c3f9d77927ed3144259dc208e6625ede Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Mon, 23 Oct 2017 19:39:28 +0200
Subject: bpf: cpumap fix potential lost wake-up problem

As pointed out by Michael, commit 1c601d829ab0 ("bpf: cpumap xdp_buff
to skb conversion and allocation") contains a classical example of the
potential lost wake-up problem.

We need to recheck the condition __ptr_ring_empty() after changing
current->state to TASK_INTERRUPTIBLE, this avoids a race between
wake_up_process() and schedule(). After this, a race with
wake_up_process() will simply change the state to TASK_RUNNING, and
the schedule() call not really put us to sleep.

Fixes: 1c601d829ab0 ("bpf: cpumap xdp_buff to skb conversion and allocation")
Reported-by: "Michael S. Tsirkin" <mst@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/cpumap.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b4358d84ddf1..86e29cbf7827 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -288,13 +288,17 @@ static int cpu_map_kthread_run(void *data)
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
-			__set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
-			sched = 1;
+			set_current_state(TASK_INTERRUPTIBLE);
+			/* Recheck to avoid lost wake-up */
+			if (__ptr_ring_empty(rcpu->queue)) {
+				schedule();
+				sched = 1;
+			} else {
+				__set_current_state(TASK_RUNNING);
+			}
 		} else {
 			sched = cond_resched();
 		}
-		__set_current_state(TASK_RUNNING);
 
 		/* Process packets in rcpu->queue */
 		local_bh_disable();
-- 
cgit v1.3-14-g43fede


From e22cdc3fc5991956146b9856d36b4971fe54dcd6 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Mon, 23 Oct 2017 19:01:54 +0600
Subject: sched/isolcpus: Fix "isolcpus=" boot parameter handling when
 !CONFIG_CPUMASK_OFFSTACK

cpulist_parse() uses nr_cpumask_bits as a limit to parse the
passed buffer from kernel commandline. What nr_cpumask_bits
represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option:

 - If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as
   NR_CPUS, which might not represent the # of CPUs that really exist
   (default 64). So, there's a chance of a gap between nr_cpu_ids
   and NR_CPUS, which ultimately lead towards invalid cpulist_parse()
   operation. For example, if isolcpus=9 is passed on an 8 cpu
   system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error
   that it's supposed to.

This patch fixes this bug by finding the last CPU of the passed
isolcpus= list and checking it against nr_cpu_ids.

It also fixes the error message where the nr_cpu_ids should be
nr_cpu_ids-1, since CPU numbering starts from 0.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: adobriyan@gmail.com
Cc: akpm@linux-foundation.org
Cc: longman@redhat.com
Cc: mka@chromium.org
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/20171023130154.9050-1-rakib.mullick@gmail.com
[ Enhanced the changelog and the kernel message. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

 include/linux/cpumask.h |   16 ++++++++++++++++
 kernel/sched/topology.c |    4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)
---
 include/linux/cpumask.h | 16 ++++++++++++++++
 kernel/sched/topology.c |  4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cd415b733c2a..63661de67ad4 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -130,6 +130,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 	return 0;
 }
 
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+	return 0;
+}
+
 /* Valid inputs for n are -1 and 0. */
 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
@@ -178,6 +183,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 	return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
+/**
+ * cpumask_last - get the last CPU in a cpumask
+ * @srcp:	- the cpumask pointer
+ *
+ * Returns	>= nr_cpumask_bits if no CPUs set.
+ */
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+	return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
 unsigned int cpumask_next(int n, const struct cpumask *srcp);
 
 /**
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e50450c2fed8..e3d31b0880dc 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -476,8 +476,8 @@ static int __init isolated_cpu_setup(char *str)
 
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 	ret = cpulist_parse(str, cpu_isolated_map);
-	if (ret) {
-		pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
+	if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) {
+		pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1);
 		return 0;
 	}
 	return 1;
-- 
cgit v1.3-14-g43fede


From 506458efaf153c1ea480591c5602a5a3ba5a3b76 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Tue, 24 Oct 2017 11:22:48 +0100
Subject: locking/barriers: Convert users of lockless_dereference() to
 READ_ONCE()

READ_ONCE() now has an implicit smp_read_barrier_depends() call, so it
can be used instead of lockless_dereference() without any change in
semantics.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1508840570-22169-4-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/events/core.c             |  2 +-
 arch/x86/include/asm/mmu_context.h |  4 ++--
 arch/x86/kernel/ldt.c              |  2 +-
 drivers/md/dm-mpath.c              | 20 ++++++++++----------
 fs/dcache.c                        |  4 ++--
 fs/overlayfs/ovl_entry.h           |  2 +-
 fs/overlayfs/readdir.c             |  2 +-
 include/linux/rculist.h            |  4 ++--
 include/linux/rcupdate.h           |  4 ++--
 kernel/events/core.c               |  4 ++--
 kernel/seccomp.c                   |  2 +-
 kernel/task_work.c                 |  2 +-
 mm/slab.h                          |  2 +-
 13 files changed, 27 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 80534d3c2480..589af1eec7c1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2371,7 +2371,7 @@ static unsigned long get_segment_base(unsigned int segment)
 		struct ldt_struct *ldt;
 
 		/* IRQs are off, so this synchronizes with smp_store_release */
-		ldt = lockless_dereference(current->active_mm->context.ldt);
+		ldt = READ_ONCE(current->active_mm->context.ldt);
 		if (!ldt || idx >= ldt->nr_entries)
 			return 0;
 
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 3c856a15b98e..efc530642f7d 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -72,8 +72,8 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
 
-	/* lockless_dereference synchronizes with smp_store_release */
-	ldt = lockless_dereference(mm->context.ldt);
+	/* READ_ONCE synchronizes with smp_store_release */
+	ldt = READ_ONCE(mm->context.ldt);
 
 	/*
 	 * Any change to mm->context.ldt is followed by an IPI to all
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index f0e64db18ac8..0a21390642c4 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -101,7 +101,7 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)
 static void install_ldt(struct mm_struct *current_mm,
 			struct ldt_struct *ldt)
 {
-	/* Synchronizes with lockless_dereference in load_mm_ldt. */
+	/* Synchronizes with READ_ONCE in load_mm_ldt. */
 	smp_store_release(&current_mm->context.ldt, ldt);
 
 	/* Activate the LDT for all CPUs using current_mm. */
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 11f273d2f018..3f88c9d32f7e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -366,7 +366,7 @@ static struct pgpath *choose_path_in_pg(struct multipath *m,
 
 	pgpath = path_to_pgpath(path);
 
-	if (unlikely(lockless_dereference(m->current_pg) != pg)) {
+	if (unlikely(READ_ONCE(m->current_pg) != pg)) {
 		/* Only update current_pgpath if pg changed */
 		spin_lock_irqsave(&m->lock, flags);
 		m->current_pgpath = pgpath;
@@ -390,7 +390,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
 	}
 
 	/* Were we instructed to switch PG? */
-	if (lockless_dereference(m->next_pg)) {
+	if (READ_ONCE(m->next_pg)) {
 		spin_lock_irqsave(&m->lock, flags);
 		pg = m->next_pg;
 		if (!pg) {
@@ -406,7 +406,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
 
 	/* Don't change PG until it has no remaining paths */
 check_current_pg:
-	pg = lockless_dereference(m->current_pg);
+	pg = READ_ONCE(m->current_pg);
 	if (pg) {
 		pgpath = choose_path_in_pg(m, pg, nr_bytes);
 		if (!IS_ERR_OR_NULL(pgpath))
@@ -473,7 +473,7 @@ static int multipath_clone_and_map(struct dm_target *ti, struct request *rq,
 	struct request *clone;
 
 	/* Do we need to select a new pgpath? */
-	pgpath = lockless_dereference(m->current_pgpath);
+	pgpath = READ_ONCE(m->current_pgpath);
 	if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
 		pgpath = choose_pgpath(m, nr_bytes);
 
@@ -535,7 +535,7 @@ static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_m
 	bool queue_io;
 
 	/* Do we need to select a new pgpath? */
-	pgpath = lockless_dereference(m->current_pgpath);
+	pgpath = READ_ONCE(m->current_pgpath);
 	queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags);
 	if (!pgpath || !queue_io)
 		pgpath = choose_pgpath(m, nr_bytes);
@@ -1804,7 +1804,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 	struct pgpath *current_pgpath;
 	int r;
 
-	current_pgpath = lockless_dereference(m->current_pgpath);
+	current_pgpath = READ_ONCE(m->current_pgpath);
 	if (!current_pgpath)
 		current_pgpath = choose_pgpath(m, 0);
 
@@ -1826,7 +1826,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 	}
 
 	if (r == -ENOTCONN) {
-		if (!lockless_dereference(m->current_pg)) {
+		if (!READ_ONCE(m->current_pg)) {
 			/* Path status changed, redo selection */
 			(void) choose_pgpath(m, 0);
 		}
@@ -1895,9 +1895,9 @@ static int multipath_busy(struct dm_target *ti)
 		return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED);
 
 	/* Guess which priority_group will be used at next mapping time */
-	pg = lockless_dereference(m->current_pg);
-	next_pg = lockless_dereference(m->next_pg);
-	if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
+	pg = READ_ONCE(m->current_pg);
+	next_pg = READ_ONCE(m->next_pg);
+	if (unlikely(!READ_ONCE(m->current_pgpath) && next_pg))
 		pg = next_pg;
 
 	if (!pg) {
diff --git a/fs/dcache.c b/fs/dcache.c
index f90141387f01..34c852af215c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -231,7 +231,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
 {
 	/*
 	 * Be careful about RCU walk racing with rename:
-	 * use 'lockless_dereference' to fetch the name pointer.
+	 * use 'READ_ONCE' to fetch the name pointer.
 	 *
 	 * NOTE! Even if a rename will mean that the length
 	 * was not loaded atomically, we don't care. The
@@ -245,7 +245,7 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
 	 * early because the data cannot match (there can
 	 * be no NUL in the ct/tcount data)
 	 */
-	const unsigned char *cs = lockless_dereference(dentry->d_name.name);
+	const unsigned char *cs = READ_ONCE(dentry->d_name.name);
 
 	return dentry_string_cmp(cs, ct, tcount);
 }
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 25d9b5adcd42..36b49bd09264 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -77,5 +77,5 @@ static inline struct ovl_inode *OVL_I(struct inode *inode)
 
 static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi)
 {
-	return lockless_dereference(oi->__upperdentry);
+	return READ_ONCE(oi->__upperdentry);
 }
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 0f85ee9c3268..c67a7703296b 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -754,7 +754,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
 	if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
 		struct inode *inode = file_inode(file);
 
-		realfile = lockless_dereference(od->upperfile);
+		realfile = READ_ONCE(od->upperfile);
 		if (!realfile) {
 			struct path upperpath;
 
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 2bea1d5e9930..5ed091c064b2 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -274,7 +274,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
  * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
  */
 #define list_entry_rcu(ptr, type, member) \
-	container_of(lockless_dereference(ptr), type, member)
+	container_of(READ_ONCE(ptr), type, member)
 
 /*
  * Where are list_empty_rcu() and list_first_entry_rcu()?
@@ -367,7 +367,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
  * example is when items are added to the list, but never deleted.
  */
 #define list_entry_lockless(ptr, type, member) \
-	container_of((typeof(ptr))lockless_dereference(ptr), type, member)
+	container_of((typeof(ptr))READ_ONCE(ptr), type, member)
 
 /**
  * list_for_each_entry_lockless - iterate over rcu list of given type
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 1a9f70d44af9..a6ddc42f87a5 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -346,7 +346,7 @@ static inline void rcu_preempt_sleep_check(void) { }
 #define __rcu_dereference_check(p, c, space) \
 ({ \
 	/* Dependency order vs. p above. */ \
-	typeof(*p) *________p1 = (typeof(*p) *__force)lockless_dereference(p); \
+	typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \
 	RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
 	rcu_dereference_sparse(p, space); \
 	((typeof(*p) __force __kernel *)(________p1)); \
@@ -360,7 +360,7 @@ static inline void rcu_preempt_sleep_check(void) { }
 #define rcu_dereference_raw(p) \
 ({ \
 	/* Dependency order vs. p above. */ \
-	typeof(p) ________p1 = lockless_dereference(p); \
+	typeof(p) ________p1 = READ_ONCE(p); \
 	((typeof(*p) __force __kernel *)(________p1)); \
 })
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9d93db81fa36..824a583079a1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4231,7 +4231,7 @@ static void perf_remove_from_owner(struct perf_event *event)
 	 * indeed free this event, otherwise we need to serialize on
 	 * owner->perf_event_mutex.
 	 */
-	owner = lockless_dereference(event->owner);
+	owner = READ_ONCE(event->owner);
 	if (owner) {
 		/*
 		 * Since delayed_put_task_struct() also drops the last
@@ -4328,7 +4328,7 @@ again:
 		 * Cannot change, child events are not migrated, see the
 		 * comment with perf_event_ctx_lock_nested().
 		 */
-		ctx = lockless_dereference(child->ctx);
+		ctx = READ_ONCE(child->ctx);
 		/*
 		 * Since child_mutex nests inside ctx::mutex, we must jump
 		 * through hoops. We start by grabbing a reference on the ctx.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 0ae832e13b97..8ac79355915b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -189,7 +189,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
 	u32 ret = SECCOMP_RET_ALLOW;
 	/* Make sure cross-thread synced filter points somewhere sane. */
 	struct seccomp_filter *f =
-			lockless_dereference(current->seccomp.filter);
+			READ_ONCE(current->seccomp.filter);
 
 	/* Ensure unexpected behavior doesn't result in failing open. */
 	if (unlikely(WARN_ON(f == NULL)))
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 836a72a66fba..9a9f262fc53d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -67,7 +67,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
 	 * we raced with task_work_run(), *pprev == NULL/exited.
 	 */
 	raw_spin_lock_irqsave(&task->pi_lock, flags);
-	while ((work = lockless_dereference(*pprev))) {
+	while ((work = READ_ONCE(*pprev))) {
 		if (work->func != func)
 			pprev = &work->next;
 		else if (cmpxchg(pprev, work, work->next) == work)
diff --git a/mm/slab.h b/mm/slab.h
index 073362816acc..8894f811a89d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -258,7 +258,7 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx)
 	 * memcg_caches issues a write barrier to match this (see
 	 * memcg_create_kmem_cache()).
 	 */
-	cachep = lockless_dereference(arr->entries[idx]);
+	cachep = READ_ONCE(arr->entries[idx]);
 	rcu_read_unlock();
 
 	return cachep;
-- 
cgit v1.3-14-g43fede


From 0b4c6841fee03e096b735074a0c4aab3a8e92986 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 23 Oct 2017 23:53:07 -0700
Subject: bpf: use the same condition in perf event set/free bpf handler

This is a cleanup such that doing the same check in
perf_event_free_bpf_prog as we already do in
perf_event_set_bpf_prog step.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/events/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 31ee304a5844..9f78a6825bbe 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8191,10 +8191,10 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 {
 	struct bpf_prog *prog;
 
-	perf_event_free_bpf_handler(event);
-
-	if (!event->tp_event)
+	if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+		perf_event_free_bpf_handler(event);
 		return;
+	}
 
 	prog = event->tp_event->prog;
 	if (prog && event->tp_event->bpf_prog_owner == event) {
-- 
cgit v1.3-14-g43fede


From e87c6bc3852b981e71c757be20771546ce9f76f3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 23 Oct 2017 23:53:08 -0700
Subject: bpf: permit multiple bpf attachments for a single perf event

This patch enables multiple bpf attachments for a
kprobe/uprobe/tracepoint single trace event.
Each trace_event keeps a list of attached perf events.
When an event happens, all attached bpf programs will
be executed based on the order of attachment.

A global bpf_event_mutex lock is introduced to protect
prog_array attaching and detaching. An alternative will
be introduce a mutex lock in every trace_event_call
structure, but it takes a lot of extra memory.
So a global bpf_event_mutex lock is a good compromise.

The bpf prog detachment involves allocation of memory.
If the allocation fails, a dummy do-nothing program
will replace to-be-detached program in-place.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h           | 30 +++++++++++++---
 include/linux/trace_events.h  | 43 ++++++++++++++++++++---
 include/trace/perf.h          |  6 ++--
 kernel/bpf/core.c             | 81 ++++++++++++++++++++++++++++++++++++++++++
 kernel/events/core.c          | 26 +++++---------
 kernel/trace/bpf_trace.c      | 82 ++++++++++++++++++++++++++++++++++++++++---
 kernel/trace/trace_kprobe.c   |  6 ++--
 kernel/trace/trace_syscalls.c | 34 ++++++++++--------
 kernel/trace/trace_uprobe.c   |  3 +-
 9 files changed, 255 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1e334b248ff6..172be7faf7ba 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -273,18 +273,38 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
 int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 				__u32 __user *prog_ids, u32 cnt);
 
-#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+				struct bpf_prog *old_prog);
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+			struct bpf_prog *exclude_prog,
+			struct bpf_prog *include_prog,
+			struct bpf_prog_array **new_array);
+
+#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)	\
 	({						\
-		struct bpf_prog **_prog;		\
+		struct bpf_prog **_prog, *__prog;	\
+		struct bpf_prog_array *_array;		\
 		u32 _ret = 1;				\
 		rcu_read_lock();			\
-		_prog = rcu_dereference(array)->progs;	\
-		for (; *_prog; _prog++)			\
-			_ret &= func(*_prog, ctx);	\
+		_array = rcu_dereference(array);	\
+		if (unlikely(check_non_null && !_array))\
+			goto _out;			\
+		_prog = _array->progs;			\
+		while ((__prog = READ_ONCE(*_prog))) {	\
+			_ret &= func(__prog, ctx);	\
+			_prog++;			\
+		}					\
+_out:							\
 		rcu_read_unlock();			\
 		_ret;					\
 	 })
 
+#define BPF_PROG_RUN_ARRAY(array, ctx, func)		\
+	__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+
+#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)	\
+	__BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2e0f22298fe9..fc6aeca945db 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -271,14 +271,37 @@ struct trace_event_call {
 #ifdef CONFIG_PERF_EVENTS
 	int				perf_refcount;
 	struct hlist_head __percpu	*perf_events;
-	struct bpf_prog			*prog;
-	struct perf_event		*bpf_prog_owner;
+	struct bpf_prog_array __rcu	*prog_array;
 
 	int	(*perf_perm)(struct trace_event_call *,
 			     struct perf_event *);
 #endif
 };
 
+#ifdef CONFIG_PERF_EVENTS
+static inline bool bpf_prog_array_valid(struct trace_event_call *call)
+{
+	/*
+	 * This inline function checks whether call->prog_array
+	 * is valid or not. The function is called in various places,
+	 * outside rcu_read_lock/unlock, as a heuristic to speed up execution.
+	 *
+	 * If this function returns true, and later call->prog_array
+	 * becomes false inside rcu_read_lock/unlock region,
+	 * we bail out then. If this function return false,
+	 * there is a risk that we might miss a few events if the checking
+	 * were delayed until inside rcu_read_lock/unlock region and
+	 * call->prog_array happened to become non-NULL then.
+	 *
+	 * Here, READ_ONCE() is used instead of rcu_access_pointer().
+	 * rcu_access_pointer() requires the actual definition of
+	 * "struct bpf_prog_array" while READ_ONCE() only needs
+	 * a declaration of the same type.
+	 */
+	return !!READ_ONCE(call->prog_array);
+}
+#endif
+
 static inline const char *
 trace_event_name(struct trace_event_call *call)
 {
@@ -435,12 +458,23 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
 }
 
 #ifdef CONFIG_BPF_EVENTS
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx);
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx);
+int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog);
+void perf_event_detach_bpf_prog(struct perf_event *event);
 #else
-static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
 	return 1;
 }
+
+static inline int
+perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void perf_event_detach_bpf_prog(struct perf_event *event) { }
+
 #endif
 
 enum {
@@ -511,6 +545,7 @@ perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type,
 {
 	perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event);
 }
+
 #endif
 
 #endif /* _LINUX_TRACE_EVENT_H */
diff --git a/include/trace/perf.h b/include/trace/perf.h
index 04fe68bbe767..14f127b6acf5 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -34,7 +34,6 @@ perf_trace_##call(void *__data, proto)					\
 	struct trace_event_call *event_call = __data;			\
 	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
 	struct trace_event_raw_##call *entry;				\
-	struct bpf_prog *prog = event_call->prog;			\
 	struct pt_regs *__regs;						\
 	u64 __count = 1;						\
 	struct task_struct *__task = NULL;				\
@@ -46,8 +45,9 @@ perf_trace_##call(void *__data, proto)					\
 	__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
 									\
 	head = this_cpu_ptr(event_call->perf_events);			\
-	if (!prog && __builtin_constant_p(!__task) && !__task &&	\
-				hlist_empty(head))			\
+	if (!bpf_prog_array_valid(event_call) &&			\
+	    __builtin_constant_p(!__task) && !__task &&			\
+	    hlist_empty(head))						\
 		return;							\
 									\
 	__entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8e7c8bf2b687..7fe448799d76 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1394,6 +1394,20 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+static unsigned int __bpf_prog_ret1(const void *ctx,
+				    const struct bpf_insn *insn)
+{
+	return 1;
+}
+
+static struct bpf_prog_dummy {
+	struct bpf_prog prog;
+} dummy_bpf_prog = {
+	.prog = {
+		.bpf_func = __bpf_prog_ret1,
+	},
+};
+
 /* to avoid allocating empty bpf_prog_array for cgroups that
  * don't have bpf program attached use one global 'empty_prog_array'
  * It will not be modified the caller of bpf_prog_array_alloc()
@@ -1463,6 +1477,73 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
 	return 0;
 }
 
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+				struct bpf_prog *old_prog)
+{
+	struct bpf_prog **prog = progs->progs;
+
+	for (; *prog; prog++)
+		if (*prog == old_prog) {
+			WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+			break;
+		}
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+			struct bpf_prog *exclude_prog,
+			struct bpf_prog *include_prog,
+			struct bpf_prog_array **new_array)
+{
+	int new_prog_cnt, carry_prog_cnt = 0;
+	struct bpf_prog **existing_prog;
+	struct bpf_prog_array *array;
+	int new_prog_idx = 0;
+
+	/* Figure out how many existing progs we need to carry over to
+	 * the new array.
+	 */
+	if (old_array) {
+		existing_prog = old_array->progs;
+		for (; *existing_prog; existing_prog++) {
+			if (*existing_prog != exclude_prog &&
+			    *existing_prog != &dummy_bpf_prog.prog)
+				carry_prog_cnt++;
+			if (*existing_prog == include_prog)
+				return -EEXIST;
+		}
+	}
+
+	/* How many progs (not NULL) will be in the new array? */
+	new_prog_cnt = carry_prog_cnt;
+	if (include_prog)
+		new_prog_cnt += 1;
+
+	/* Do we have any prog (not NULL) in the new array? */
+	if (!new_prog_cnt) {
+		*new_array = NULL;
+		return 0;
+	}
+
+	/* +1 as the end of prog_array is marked with NULL */
+	array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+	if (!array)
+		return -ENOMEM;
+
+	/* Fill in the new prog array */
+	if (carry_prog_cnt) {
+		existing_prog = old_array->progs;
+		for (; *existing_prog; existing_prog++)
+			if (*existing_prog != exclude_prog &&
+			    *existing_prog != &dummy_bpf_prog.prog)
+				array->progs[new_prog_idx++] = *existing_prog;
+	}
+	if (include_prog)
+		array->progs[new_prog_idx++] = include_prog;
+	array->progs[new_prog_idx] = NULL;
+	*new_array = array;
+	return 0;
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9f78a6825bbe..9660ee65fbef 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7954,11 +7954,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 			       struct pt_regs *regs, struct hlist_head *head,
 			       struct task_struct *task)
 {
-	struct bpf_prog *prog = call->prog;
-
-	if (prog) {
+	if (bpf_prog_array_valid(call)) {
 		*(struct pt_regs **)raw_data = regs;
-		if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+		if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
 			perf_swevent_put_recursion_context(rctx);
 			return;
 		}
@@ -8147,13 +8145,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
 	bool is_kprobe, is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
+	int ret;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return perf_event_set_bpf_handler(event, prog_fd);
 
-	if (event->tp_event->prog)
-		return -EEXIST;
-
 	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 	is_syscall_tp = is_syscall_trace_event(event->tp_event);
@@ -8181,26 +8177,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 			return -EACCES;
 		}
 	}
-	event->tp_event->prog = prog;
-	event->tp_event->bpf_prog_owner = event;
 
-	return 0;
+	ret = perf_event_attach_bpf_prog(event, prog);
+	if (ret)
+		bpf_prog_put(prog);
+	return ret;
 }
 
 static void perf_event_free_bpf_prog(struct perf_event *event)
 {
-	struct bpf_prog *prog;
-
 	if (event->attr.type != PERF_TYPE_TRACEPOINT) {
 		perf_event_free_bpf_handler(event);
 		return;
 	}
-
-	prog = event->tp_event->prog;
-	if (prog && event->tp_event->bpf_prog_owner == event) {
-		event->tp_event->prog = NULL;
-		bpf_prog_put(prog);
-	}
+	perf_event_detach_bpf_prog(event);
 }
 
 #else
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3126da2f468a..b65011d320e3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,7 +17,7 @@
 
 /**
  * trace_call_bpf - invoke BPF program
- * @prog: BPF program
+ * @call: tracepoint event
  * @ctx: opaque context pointer
  *
  * kprobe handlers execute BPF programs via this helper.
@@ -29,7 +29,7 @@
  * 1 - store kprobe event into ring buffer
  * Other values are reserved and currently alias to 1
  */
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
 	unsigned int ret;
 
@@ -49,9 +49,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
 		goto out;
 	}
 
-	rcu_read_lock();
-	ret = BPF_PROG_RUN(prog, ctx);
-	rcu_read_unlock();
+	/*
+	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+	 * to all call sites, we did a bpf_prog_array_valid() there to check
+	 * whether call->prog_array is empty or not, which is
+	 * a heurisitc to speed up execution.
+	 *
+	 * If bpf_prog_array_valid() fetched prog_array was
+	 * non-NULL, we go into trace_call_bpf() and do the actual
+	 * proper rcu_dereference() under RCU lock.
+	 * If it turns out that prog_array is NULL then, we bail out.
+	 * For the opposite, if the bpf_prog_array_valid() fetched pointer
+	 * was NULL, you'll skip the prog_array with the risk of missing
+	 * out of events when it was updated in between this and the
+	 * rcu_dereference() which is accepted risk.
+	 */
+	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
 
  out:
 	__this_cpu_dec(bpf_prog_active);
@@ -741,3 +754,62 @@ const struct bpf_verifier_ops perf_event_verifier_ops = {
 
 const struct bpf_prog_ops perf_event_prog_ops = {
 };
+
+static DEFINE_MUTEX(bpf_event_mutex);
+
+int perf_event_attach_bpf_prog(struct perf_event *event,
+			       struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	int ret = -EEXIST;
+
+	mutex_lock(&bpf_event_mutex);
+
+	if (event->prog)
+		goto out;
+
+	old_array = rcu_dereference_protected(event->tp_event->prog_array,
+					      lockdep_is_held(&bpf_event_mutex));
+	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+	if (ret < 0)
+		goto out;
+
+	/* set the new array to event->tp_event and set event->prog */
+	event->prog = prog;
+	rcu_assign_pointer(event->tp_event->prog_array, new_array);
+	bpf_prog_array_free(old_array);
+
+out:
+	mutex_unlock(&bpf_event_mutex);
+	return ret;
+}
+
+void perf_event_detach_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	int ret;
+
+	mutex_lock(&bpf_event_mutex);
+
+	if (!event->prog)
+		goto out;
+
+	old_array = rcu_dereference_protected(event->tp_event->prog_array,
+					      lockdep_is_held(&bpf_event_mutex));
+
+	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+	if (ret < 0) {
+		bpf_prog_array_delete_safe(old_array, event->prog);
+	} else {
+		rcu_assign_pointer(event->tp_event->prog_array, new_array);
+		bpf_prog_array_free(old_array);
+	}
+
+	bpf_prog_put(event->prog);
+	event->prog = NULL;
+
+out:
+	mutex_unlock(&bpf_event_mutex);
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e12b6b9..abf92e478cfb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1174,13 +1174,12 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
-	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	head = this_cpu_ptr(call->perf_events);
@@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
-	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	head = this_cpu_ptr(call->perf_events);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 696afe72d3b1..71a6af34d7a9 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -559,9 +559,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
-static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
-			      struct syscall_metadata *sys_data,
-			      struct syscall_trace_enter *rec) {
+static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+			       struct syscall_metadata *sys_data,
+			       struct syscall_trace_enter *rec)
+{
 	struct syscall_tp_t {
 		unsigned long long regs;
 		unsigned long syscall_nr;
@@ -573,7 +574,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
 	param.syscall_nr = rec->nr;
 	for (i = 0; i < sys_data->nb_args; i++)
 		param.args[i] = rec->args[i];
-	return trace_call_bpf(prog, &param);
+	return trace_call_bpf(call, &param);
 }
 
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
@@ -581,7 +582,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
 	struct hlist_head *head;
-	struct bpf_prog *prog;
+	bool valid_prog_array;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -596,9 +597,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
-	prog = READ_ONCE(sys_data->enter_event->prog);
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	if (!prog && hlist_empty(head))
+	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+	if (!valid_prog_array && hlist_empty(head))
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
@@ -614,7 +615,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
 
-	if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+	if ((valid_prog_array &&
+	     !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
 		return;
@@ -659,8 +661,9 @@ static void perf_sysenter_disable(struct trace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
-			      struct syscall_trace_exit *rec) {
+static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
+			      struct syscall_trace_exit *rec)
+{
 	struct syscall_tp_t {
 		unsigned long long regs;
 		unsigned long syscall_nr;
@@ -670,7 +673,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
 	*(struct pt_regs **)&param = regs;
 	param.syscall_nr = rec->nr;
 	param.ret = rec->ret;
-	return trace_call_bpf(prog, &param);
+	return trace_call_bpf(call, &param);
 }
 
 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
@@ -678,7 +681,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
 	struct hlist_head *head;
-	struct bpf_prog *prog;
+	bool valid_prog_array;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -693,9 +696,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	prog = READ_ONCE(sys_data->exit_event->prog);
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	if (!prog && hlist_empty(head))
+	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+	if (!valid_prog_array && hlist_empty(head))
 		return;
 
 	/* We can probably do that at build time */
@@ -709,7 +712,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+	if ((valid_prog_array &&
+	     !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
 		return;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4525e0271a53..153c0e411461 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 {
 	struct trace_event_call *call = &tu->tp.call;
 	struct uprobe_trace_entry_head *entry;
-	struct bpf_prog *prog = call->prog;
 	struct hlist_head *head;
 	void *data;
 	int size, esize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
-- 
cgit v1.3-14-g43fede


From e0d02285f16e8d5810f3d5d5e8a5886ca0015d3b Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 12 Oct 2017 13:20:47 +0100
Subject: locking/qrwlock: Use 'struct qrwlock' instead of 'struct __qrwlock'

There's no good reason to keep the internal structure of struct qrwlock
hidden from qrwlock.h, particularly as it's actually needed for unlock
and ends up being abstracted independently behind the __qrwlock_write_byte()
function.

Stop pretending we can hide this stuff, and move the __qrwlock definition
into qrwlock, removing the __qrwlock_write_byte() nastiness and using the
same struct definition everywhere instead.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Jeremy.Linton@arm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <longman@redhat.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1507810851-306-2-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/qrwlock.h       | 12 +-----------
 include/asm-generic/qrwlock_types.h | 15 +++++++++++++--
 kernel/locking/qrwlock.c            | 26 ++------------------------
 3 files changed, 16 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 50925327b0a8..02c0a768e6b0 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -128,23 +128,13 @@ static inline void queued_read_unlock(struct qrwlock *lock)
 	(void)atomic_sub_return_release(_QR_BIAS, &lock->cnts);
 }
 
-/**
- * __qrwlock_write_byte - retrieve the write byte address of a queue rwlock
- * @lock : Pointer to queue rwlock structure
- * Return: the write byte address of a queue rwlock
- */
-static inline u8 *__qrwlock_write_byte(struct qrwlock *lock)
-{
-	return (u8 *)lock + 3 * IS_BUILTIN(CONFIG_CPU_BIG_ENDIAN);
-}
-
 /**
  * queued_write_unlock - release write lock of a queue rwlock
  * @lock : Pointer to queue rwlock structure
  */
 static inline void queued_write_unlock(struct qrwlock *lock)
 {
-	smp_store_release(__qrwlock_write_byte(lock), 0);
+	smp_store_release(&lock->wmode, 0);
 }
 
 /*
diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h
index 0abc6b6062fb..507f2dc51bba 100644
--- a/include/asm-generic/qrwlock_types.h
+++ b/include/asm-generic/qrwlock_types.h
@@ -9,12 +9,23 @@
  */
 
 typedef struct qrwlock {
-	atomic_t		cnts;
+	union {
+		atomic_t cnts;
+		struct {
+#ifdef __LITTLE_ENDIAN
+			u8 wmode;	/* Writer mode   */
+			u8 rcnts[3];	/* Reader counts */
+#else
+			u8 rcnts[3];	/* Reader counts */
+			u8 wmode;	/* Writer mode   */
+#endif
+		};
+	};
 	arch_spinlock_t		wait_lock;
 } arch_rwlock_t;
 
 #define	__ARCH_RW_LOCK_UNLOCKED {		\
-	.cnts = ATOMIC_INIT(0),			\
+	{ .cnts = ATOMIC_INIT(0), },		\
 	.wait_lock = __ARCH_SPIN_LOCK_UNLOCKED,	\
 }
 
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 2655f26ec882..1af791e37348 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -23,26 +23,6 @@
 #include <linux/spinlock.h>
 #include <asm/qrwlock.h>
 
-/*
- * This internal data structure is used for optimizing access to some of
- * the subfields within the atomic_t cnts.
- */
-struct __qrwlock {
-	union {
-		atomic_t cnts;
-		struct {
-#ifdef __LITTLE_ENDIAN
-			u8 wmode;	/* Writer mode   */
-			u8 rcnts[3];	/* Reader counts */
-#else
-			u8 rcnts[3];	/* Reader counts */
-			u8 wmode;	/* Writer mode   */
-#endif
-		};
-	};
-	arch_spinlock_t	lock;
-};
-
 /**
  * rspin_until_writer_unlock - inc reader count & spin until writer is gone
  * @lock  : Pointer to queue rwlock structure
@@ -125,10 +105,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 	 * or wait for a previous writer to go away.
 	 */
 	for (;;) {
-		struct __qrwlock *l = (struct __qrwlock *)lock;
-
-		if (!READ_ONCE(l->wmode) &&
-		   (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
+		if (!READ_ONCE(lock->wmode) &&
+		   (cmpxchg_relaxed(&lock->wmode, 0, _QW_WAITING) == 0))
 			break;
 
 		cpu_relax();
-- 
cgit v1.3-14-g43fede


From b519b56e378ee82caf9b079b04f5db87dedc3251 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 12 Oct 2017 13:20:49 +0100
Subject: locking/qrwlock: Use atomic_cond_read_acquire() when spinning in
 qrwlock

The qrwlock slowpaths involve spinning when either a prospective reader
is waiting for a concurrent writer to drain, or a prospective writer is
waiting for concurrent readers to drain. In both of these situations,
atomic_cond_read_acquire() can be used to avoid busy-waiting and make use
of any backoff functionality provided by the architecture.

This patch replaces the open-code loops and rspin_until_writer_unlock()
implementation with atomic_cond_read_acquire(). The write mode transition
zero to _QW_WAITING is left alone, since (a) this doesn't need acquire
semantics and (b) should be fast.

Tested-by: Waiman Long <longman@redhat.com>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Tested-by: Adam Wallis <awallis@codeaurora.org>
Tested-by: Jan Glauber <jglauber@cavium.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Jeremy.Linton@arm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1507810851-306-4-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/qrwlock.h |  4 ++--
 kernel/locking/qrwlock.c      | 50 +++++++++++--------------------------------
 2 files changed, 14 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 02c0a768e6b0..c716b02e8fd7 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -49,7 +49,7 @@
 /*
  * External function declarations
  */
-extern void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts);
+extern void queued_read_lock_slowpath(struct qrwlock *lock);
 extern void queued_write_lock_slowpath(struct qrwlock *lock);
 
 /**
@@ -100,7 +100,7 @@ static inline void queued_read_lock(struct qrwlock *lock)
 		return;
 
 	/* The slowpath will decrement the reader count, if necessary. */
-	queued_read_lock_slowpath(lock, cnts);
+	queued_read_lock_slowpath(lock);
 }
 
 /**
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 1af791e37348..5825e0fc1a8e 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -23,29 +23,11 @@
 #include <linux/spinlock.h>
 #include <asm/qrwlock.h>
 
-/**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock  : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
- */
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
-{
-	while ((cnts & _QW_WMASK) == _QW_LOCKED) {
-		cpu_relax();
-		cnts = atomic_read_acquire(&lock->cnts);
-	}
-}
-
 /**
  * queued_read_lock_slowpath - acquire read lock of a queue rwlock
  * @lock: Pointer to queue rwlock structure
- * @cnts: Current qrwlock lock value
  */
-void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
+void queued_read_lock_slowpath(struct qrwlock *lock)
 {
 	/*
 	 * Readers come here when they cannot get the lock without waiting
@@ -53,13 +35,12 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 	if (unlikely(in_interrupt())) {
 		/*
 		 * Readers in interrupt context will get the lock immediately
-		 * if the writer is just waiting (not holding the lock yet).
-		 * The rspin_until_writer_unlock() function returns immediately
-		 * in this case. Otherwise, they will spin (with ACQUIRE
-		 * semantics) until the lock is available without waiting in
-		 * the queue.
+		 * if the writer is just waiting (not holding the lock yet),
+		 * so spin with ACQUIRE semantics until the lock is available
+		 * without waiting in the queue.
 		 */
-		rspin_until_writer_unlock(lock, cnts);
+		atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK)
+					 != _QW_LOCKED);
 		return;
 	}
 	atomic_sub(_QR_BIAS, &lock->cnts);
@@ -68,14 +49,14 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
 	 * Put the reader into the wait queue
 	 */
 	arch_spin_lock(&lock->wait_lock);
+	atomic_add(_QR_BIAS, &lock->cnts);
 
 	/*
 	 * The ACQUIRE semantics of the following spinning code ensure
 	 * that accesses can't leak upwards out of our subsequent critical
 	 * section in the case that the lock is currently held for write.
 	 */
-	cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
-	rspin_until_writer_unlock(lock, cnts);
+	atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) != _QW_LOCKED);
 
 	/*
 	 * Signal the next one in queue to become queue head
@@ -90,8 +71,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
  */
 void queued_write_lock_slowpath(struct qrwlock *lock)
 {
-	u32 cnts;
-
 	/* Put the writer into the wait queue */
 	arch_spin_lock(&lock->wait_lock);
 
@@ -113,15 +92,10 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 	}
 
 	/* When no more readers, set the locked flag */
-	for (;;) {
-		cnts = atomic_read(&lock->cnts);
-		if ((cnts == _QW_WAITING) &&
-		    (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
-					    _QW_LOCKED) == _QW_WAITING))
-			break;
-
-		cpu_relax();
-	}
+	do {
+		atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
+	} while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
+					_QW_LOCKED) != _QW_WAITING);
 unlock:
 	arch_spin_unlock(&lock->wait_lock);
 }
-- 
cgit v1.3-14-g43fede


From d133166146333e1f13fc81c0e6c43c8d99290a8a Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 12 Oct 2017 13:20:51 +0100
Subject: locking/qrwlock: Prevent slowpath writers getting held up by fastpath

When a prospective writer takes the qrwlock locking slowpath due to the
lock being held, it attempts to cmpxchg the wmode field from 0 to
_QW_WAITING so that concurrent lockers also take the slowpath and queue
on the spinlock accordingly, allowing the lockers to drain.

Unfortunately, this isn't fair, because a fastpath writer that comes in
after the lock is made available but before the _QW_WAITING flag is set
can effectively jump the queue. If there is a steady stream of prospective
writers, then the waiter will be held off indefinitely.

This patch restores fairness by separating _QW_WAITING and _QW_LOCKED
into two distinct fields: _QW_LOCKED continues to occupy the bottom byte
of the lockword so that it can be cleared unconditionally when unlocking,
but _QW_WAITING now occupies what used to be the bottom bit of the reader
count. This then forces the slow-path for concurrent lockers.

Tested-by: Waiman Long <longman@redhat.com>
Tested-by: Jeremy Linton <jeremy.linton@arm.com>
Tested-by: Adam Wallis <awallis@codeaurora.org>
Tested-by: Jan Glauber <jglauber@cavium.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Jeremy.Linton@arm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1507810851-306-6-git-send-email-will.deacon@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/qrwlock.h       | 23 +++++------------------
 include/asm-generic/qrwlock_types.h |  8 ++++----
 kernel/locking/qrwlock.c            | 20 +++++---------------
 3 files changed, 14 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index c716b02e8fd7..0f7062bd55e5 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -26,24 +26,11 @@
 
 /*
  * Writer states & reader shift and bias.
- *
- *       | +0 | +1 | +2 | +3 |
- *   ----+----+----+----+----+
- *    LE | 78 | 56 | 34 | 12 | 0x12345678
- *   ----+----+----+----+----+
- *       | wr |      rd      |
- *       +----+----+----+----+
- *
- *   ----+----+----+----+----+
- *    BE | 12 | 34 | 56 | 78 | 0x12345678
- *   ----+----+----+----+----+
- *       |      rd      | wr |
- *       +----+----+----+----+
  */
-#define	_QW_WAITING	1		/* A writer is waiting	   */
-#define	_QW_LOCKED	0xff		/* A writer holds the lock */
-#define	_QW_WMASK	0xff		/* Writer mask		   */
-#define	_QR_SHIFT	8		/* Reader count shift	   */
+#define	_QW_WAITING	0x100		/* A writer is waiting	   */
+#define	_QW_LOCKED	0x0ff		/* A writer holds the lock */
+#define	_QW_WMASK	0x1ff		/* Writer mask		   */
+#define	_QR_SHIFT	9		/* Reader count shift	   */
 #define _QR_BIAS	(1U << _QR_SHIFT)
 
 /*
@@ -134,7 +121,7 @@ static inline void queued_read_unlock(struct qrwlock *lock)
  */
 static inline void queued_write_unlock(struct qrwlock *lock)
 {
-	smp_store_release(&lock->wmode, 0);
+	smp_store_release(&lock->wlocked, 0);
 }
 
 /*
diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h
index 507f2dc51bba..8af752acbdc0 100644
--- a/include/asm-generic/qrwlock_types.h
+++ b/include/asm-generic/qrwlock_types.h
@@ -13,11 +13,11 @@ typedef struct qrwlock {
 		atomic_t cnts;
 		struct {
 #ifdef __LITTLE_ENDIAN
-			u8 wmode;	/* Writer mode   */
-			u8 rcnts[3];	/* Reader counts */
+			u8 wlocked;	/* Locked for write? */
+			u8 __lstate[3];
 #else
-			u8 rcnts[3];	/* Reader counts */
-			u8 wmode;	/* Writer mode   */
+			u8 __lstate[3];
+			u8 wlocked;	/* Locked for write? */
 #endif
 		};
 	};
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 5825e0fc1a8e..c7471c3fb798 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -39,8 +39,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
 		 * so spin with ACQUIRE semantics until the lock is available
 		 * without waiting in the queue.
 		 */
-		atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK)
-					 != _QW_LOCKED);
+		atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
 		return;
 	}
 	atomic_sub(_QR_BIAS, &lock->cnts);
@@ -56,7 +55,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
 	 * that accesses can't leak upwards out of our subsequent critical
 	 * section in the case that the lock is currently held for write.
 	 */
-	atomic_cond_read_acquire(&lock->cnts, (VAL & _QW_WMASK) != _QW_LOCKED);
+	atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
 
 	/*
 	 * Signal the next one in queue to become queue head
@@ -79,19 +78,10 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 	    (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
 		goto unlock;
 
-	/*
-	 * Set the waiting flag to notify readers that a writer is pending,
-	 * or wait for a previous writer to go away.
-	 */
-	for (;;) {
-		if (!READ_ONCE(lock->wmode) &&
-		   (cmpxchg_relaxed(&lock->wmode, 0, _QW_WAITING) == 0))
-			break;
-
-		cpu_relax();
-	}
+	/* Set the waiting flag to notify readers that a writer is pending */
+	atomic_add(_QW_WAITING, &lock->cnts);
 
-	/* When no more readers, set the locked flag */
+	/* When no more readers or writers, set the locked flag */
 	do {
 		atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
 	} while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
-- 
cgit v1.3-14-g43fede


From c95491ed6d6a958743d82bea1d053819988da418 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 23 Oct 2017 14:07:22 -0700
Subject: locking/atomics, workqueue: Convert ACCESS_ONCE() to
 READ_ONCE()/WRITE_ONCE()

For several reasons, it is desirable to use {READ,WRITE}_ONCE() in
preference to ACCESS_ONCE(), and new code is expected to use one of the
former. So far, there's been no reason to change most existing uses of
ACCESS_ONCE(), as these aren't currently harmful.

However, for some features it is necessary to instrument reads and
writes separately, which is not possible with ACCESS_ONCE(). This
distinction is critical to correct operation.

It's possible to transform the bulk of kernel code using the Coccinelle
script below. However, this doesn't handle comments, leaving references
to ACCESS_ONCE() instances which have been removed. As a preparatory
step, this patch converts the workqueue code and comments to use
{READ,WRITE}_ONCE() consistently.

----
virtual patch

@ depends on patch @
expression E1, E2;
@@

- ACCESS_ONCE(E1) = E2
+ WRITE_ONCE(E1, E2)

@ depends on patch @
expression E;
@@

- ACCESS_ONCE(E)
+ READ_ONCE(E)
----

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: linux-arch@vger.kernel.org
Cc: mpe@ellerman.id.au
Cc: shuah@kernel.org
Cc: snitzer@redhat.com
Cc: thor.thayer@linux.intel.com
Cc: viro@zeniv.linux.org.uk
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1508792849-3115-12-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/workqueue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..39831b2f3c5f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4647,7 +4647,7 @@ static void rebind_workers(struct worker_pool *pool)
 		 * concurrency management.  Note that when or whether
 		 * @worker clears REBOUND doesn't affect correctness.
 		 *
-		 * ACCESS_ONCE() is necessary because @worker->flags may be
+		 * WRITE_ONCE() is necessary because @worker->flags may be
 		 * tested without holding any lock in
 		 * wq_worker_waking_up().  Without it, NOT_RUNNING test may
 		 * fail incorrectly leading to premature concurrency
@@ -4656,7 +4656,7 @@ static void rebind_workers(struct worker_pool *pool)
 		WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
 		worker_flags |= WORKER_REBOUND;
 		worker_flags &= ~WORKER_UNBOUND;
-		ACCESS_ONCE(worker->flags) = worker_flags;
+		WRITE_ONCE(worker->flags, worker_flags);
 	}
 
 	spin_unlock_irq(&pool->lock);
-- 
cgit v1.3-14-g43fede


From 6aa7de059173a986114ac43b8f50b297a86f09a8 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 23 Oct 2017 14:07:29 -0700
Subject: locking/atomics: COCCINELLE/treewide: Convert trivial ACCESS_ONCE()
 patterns to READ_ONCE()/WRITE_ONCE()

Please do not apply this to mainline directly, instead please re-run the
coccinelle script shown below and apply its output.

For several reasons, it is desirable to use {READ,WRITE}_ONCE() in
preference to ACCESS_ONCE(), and new code is expected to use one of the
former. So far, there's been no reason to change most existing uses of
ACCESS_ONCE(), as these aren't harmful, and changing them results in
churn.

However, for some features, the read/write distinction is critical to
correct operation. To distinguish these cases, separate read/write
accessors must be used. This patch migrates (most) remaining
ACCESS_ONCE() instances to {READ,WRITE}_ONCE(), using the following
coccinelle script:

----
// Convert trivial ACCESS_ONCE() uses to equivalent READ_ONCE() and
// WRITE_ONCE()

// $ make coccicheck COCCI=/home/mark/once.cocci SPFLAGS="--include-headers" MODE=patch

virtual patch

@ depends on patch @
expression E1, E2;
@@

- ACCESS_ONCE(E1) = E2
+ WRITE_ONCE(E1, E2)

@ depends on patch @
expression E;
@@

- ACCESS_ONCE(E)
+ READ_ONCE(E)
----

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Cc: linux-arch@vger.kernel.org
Cc: mpe@ellerman.id.au
Cc: shuah@kernel.org
Cc: snitzer@redhat.com
Cc: thor.thayer@linux.intel.com
Cc: tj@kernel.org
Cc: viro@zeniv.linux.org.uk
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1508792849-3115-19-git-send-email-paulmck@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arc/kernel/smp.c                              |  2 +-
 arch/arm/include/asm/spinlock.h                    |  2 +-
 arch/arm/mach-tegra/cpuidle-tegra20.c              |  2 +-
 arch/arm/vdso/vgettimeofday.c                      |  2 +-
 arch/ia64/include/asm/spinlock.h                   |  8 ++---
 arch/mips/include/asm/vdso.h                       |  2 +-
 arch/mips/kernel/pm-cps.c                          |  2 +-
 arch/mn10300/kernel/mn10300-serial.c               |  4 +--
 arch/parisc/include/asm/atomic.h                   |  2 +-
 arch/powerpc/platforms/powernv/opal-msglog.c       |  2 +-
 arch/s390/include/asm/spinlock.h                   |  6 ++--
 arch/s390/lib/spinlock.c                           | 16 +++++-----
 arch/sparc/include/asm/atomic_32.h                 |  2 +-
 arch/tile/gxio/dma_queue.c                         |  4 +--
 arch/tile/include/gxio/dma_queue.h                 |  2 +-
 arch/tile/kernel/ptrace.c                          |  2 +-
 arch/x86/entry/common.c                            |  2 +-
 arch/x86/entry/vdso/vclock_gettime.c               |  2 +-
 arch/x86/events/core.c                             |  2 +-
 arch/x86/include/asm/vgtod.h                       |  2 +-
 arch/x86/kernel/espfix_64.c                        |  6 ++--
 arch/x86/kernel/nmi.c                              |  2 +-
 arch/x86/kvm/mmu.c                                 |  4 +--
 arch/x86/kvm/page_track.c                          |  2 +-
 arch/x86/xen/p2m.c                                 |  2 +-
 arch/xtensa/platforms/xtfpga/lcd.c                 | 14 ++++-----
 block/blk-wbt.c                                    |  2 +-
 drivers/base/core.c                                |  2 +-
 drivers/base/power/runtime.c                       |  4 +--
 drivers/char/random.c                              |  4 +--
 drivers/clocksource/bcm2835_timer.c                |  2 +-
 drivers/crypto/caam/jr.c                           |  4 +--
 drivers/crypto/nx/nx-842-powernv.c                 |  2 +-
 drivers/firewire/ohci.c                            | 10 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c          |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c            |  4 +--
 drivers/gpu/drm/amd/scheduler/gpu_scheduler.c      |  2 +-
 drivers/gpu/drm/radeon/radeon_gem.c                |  4 +--
 drivers/gpu/drm/vmwgfx/vmwgfx_surface.c            |  2 +-
 drivers/infiniband/hw/hfi1/file_ops.c              |  2 +-
 drivers/infiniband/hw/hfi1/pio.c                   |  6 ++--
 drivers/infiniband/hw/hfi1/ruc.c                   |  2 +-
 drivers/infiniband/hw/hfi1/sdma.c                  |  8 ++---
 drivers/infiniband/hw/hfi1/sdma.h                  |  2 +-
 drivers/infiniband/hw/hfi1/uc.c                    |  4 +--
 drivers/infiniband/hw/hfi1/ud.c                    |  4 +--
 drivers/infiniband/hw/hfi1/user_sdma.c             |  8 ++---
 drivers/infiniband/hw/qib/qib_ruc.c                |  2 +-
 drivers/infiniband/hw/qib/qib_uc.c                 |  4 +--
 drivers/infiniband/hw/qib/qib_ud.c                 |  4 +--
 drivers/infiniband/sw/rdmavt/qp.c                  |  6 ++--
 drivers/input/misc/regulator-haptic.c              |  2 +-
 drivers/md/dm-bufio.c                              | 10 +++---
 drivers/md/dm-kcopyd.c                             |  4 +--
 drivers/md/dm-stats.c                              | 36 +++++++++++-----------
 drivers/md/dm-switch.c                             |  2 +-
 drivers/md/dm-thin.c                               |  2 +-
 drivers/md/dm-verity-target.c                      |  2 +-
 drivers/md/dm.c                                    |  4 +--
 drivers/md/md.c                                    |  2 +-
 drivers/md/raid5.c                                 |  2 +-
 drivers/misc/mic/scif/scif_rb.c                    |  8 ++---
 drivers/misc/mic/scif/scif_rma_list.c              |  2 +-
 drivers/net/bonding/bond_alb.c                     |  2 +-
 drivers/net/bonding/bond_main.c                    |  6 ++--
 drivers/net/ethernet/chelsio/cxgb4/sge.c           |  4 +--
 drivers/net/ethernet/emulex/benet/be_main.c        |  2 +-
 drivers/net/ethernet/hisilicon/hip04_eth.c         |  4 +--
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c     |  4 +--
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c     |  2 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c        |  4 +--
 drivers/net/ethernet/intel/i40e/i40e_ptp.c         |  4 +--
 drivers/net/ethernet/intel/igb/e1000_regs.h        |  2 +-
 drivers/net/ethernet/intel/igb/igb_main.c          |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_common.h    |  4 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c      |  8 ++---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c       |  4 +--
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c  |  2 +-
 drivers/net/ethernet/intel/ixgbevf/vf.h            |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_tx.c         | 12 ++++----
 drivers/net/ethernet/neterion/vxge/vxge-main.c     |  2 +-
 drivers/net/ethernet/sfc/ef10.c                    | 10 +++---
 drivers/net/ethernet/sfc/efx.c                     |  4 +--
 drivers/net/ethernet/sfc/falcon/efx.c              |  4 +--
 drivers/net/ethernet/sfc/falcon/falcon.c           |  4 +--
 drivers/net/ethernet/sfc/falcon/farch.c            |  8 ++---
 drivers/net/ethernet/sfc/falcon/nic.h              |  6 ++--
 drivers/net/ethernet/sfc/falcon/tx.c               |  6 ++--
 drivers/net/ethernet/sfc/farch.c                   |  8 ++---
 drivers/net/ethernet/sfc/nic.h                     |  6 ++--
 drivers/net/ethernet/sfc/ptp.c                     | 10 +++---
 drivers/net/ethernet/sfc/tx.c                      |  6 ++--
 drivers/net/ethernet/sun/niu.c                     |  4 +--
 drivers/net/tap.c                                  |  2 +-
 drivers/net/tun.c                                  |  4 +--
 drivers/net/wireless/ath/ath5k/desc.c              |  8 ++---
 .../wireless/broadcom/brcm80211/brcmfmac/sdio.c    |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/ops.c       |  2 +-
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c        |  4 +--
 drivers/net/wireless/intel/iwlwifi/pcie/rx.c       |  2 +-
 drivers/net/wireless/intel/iwlwifi/pcie/trans.c    | 10 +++---
 drivers/net/wireless/mac80211_hwsim.c              |  4 +--
 drivers/scsi/qla2xxx/qla_target.c                  |  2 +-
 drivers/target/target_core_user.c                  |  2 +-
 drivers/usb/class/cdc-wdm.c                        |  2 +-
 drivers/usb/core/devio.c                           |  2 +-
 drivers/usb/core/sysfs.c                           |  4 +--
 drivers/usb/gadget/udc/gr_udc.c                    |  4 +--
 drivers/usb/host/ohci-hcd.c                        |  2 +-
 drivers/usb/host/uhci-hcd.h                        |  4 +--
 drivers/vfio/vfio.c                                |  2 +-
 drivers/vhost/scsi.c                               |  2 +-
 fs/aio.c                                           |  2 +-
 fs/buffer.c                                        |  3 +-
 fs/crypto/keyinfo.c                                |  2 +-
 fs/direct-io.c                                     |  2 +-
 fs/exec.c                                          |  2 +-
 fs/fcntl.c                                         |  2 +-
 fs/fs_pin.c                                        |  4 +--
 fs/fuse/dev.c                                      |  2 +-
 fs/inode.c                                         |  2 +-
 fs/namei.c                                         |  4 +--
 fs/namespace.c                                     |  2 +-
 fs/nfs/dir.c                                       |  8 ++---
 fs/proc/array.c                                    |  2 +-
 fs/proc_namespace.c                                |  2 +-
 fs/splice.c                                        |  2 +-
 fs/userfaultfd.c                                   |  8 ++---
 fs/xfs/xfs_log_priv.h                              |  4 +--
 include/linux/bitops.h                             |  4 +--
 include/linux/dynamic_queue_limits.h               |  2 +-
 include/linux/huge_mm.h                            |  2 +-
 include/linux/if_team.h                            |  2 +-
 include/linux/llist.h                              |  2 +-
 include/linux/pm_runtime.h                         |  2 +-
 include/net/ip_vs.h                                |  6 ++--
 kernel/acct.c                                      |  4 +--
 kernel/events/core.c                               |  6 ++--
 kernel/events/ring_buffer.c                        |  2 +-
 kernel/exit.c                                      |  2 +-
 kernel/trace/ring_buffer.c                         |  2 +-
 kernel/trace/trace.h                               |  2 +-
 kernel/trace/trace_stack.c                         |  2 +-
 kernel/user_namespace.c                            |  2 +-
 lib/assoc_array.c                                  | 20 ++++++------
 lib/dynamic_queue_limits.c                         |  2 +-
 lib/llist.c                                        |  2 +-
 lib/vsprintf.c                                     |  4 +--
 mm/huge_memory.c                                   |  2 +-
 net/core/dev.c                                     |  2 +-
 net/core/pktgen.c                                  |  2 +-
 net/ipv4/inet_fragment.c                           |  2 +-
 net/ipv4/route.c                                   |  2 +-
 net/ipv4/tcp_output.c                              |  2 +-
 net/ipv4/udp.c                                     |  4 +--
 net/ipv6/ip6_tunnel.c                              |  8 ++---
 net/ipv6/udp.c                                     |  4 +--
 net/llc/llc_input.c                                |  4 +--
 net/mac80211/sta_info.c                            |  2 +-
 net/netlabel/netlabel_calipso.c                    |  2 +-
 net/wireless/nl80211.c                             |  2 +-
 sound/firewire/amdtp-am824.c                       |  6 ++--
 sound/firewire/amdtp-stream.c                      | 23 +++++++-------
 sound/firewire/amdtp-stream.h                      |  2 +-
 sound/firewire/digi00x/amdtp-dot.c                 |  6 ++--
 sound/firewire/fireface/amdtp-ff.c                 |  4 +--
 sound/firewire/fireface/ff-midi.c                  | 10 +++---
 sound/firewire/fireface/ff-transaction.c           |  8 ++---
 sound/firewire/isight.c                            | 18 +++++------
 sound/firewire/motu/amdtp-motu.c                   |  4 +--
 sound/firewire/oxfw/oxfw-scs1x.c                   | 12 ++++----
 sound/firewire/tascam/amdtp-tascam.c               |  4 +--
 sound/firewire/tascam/tascam-transaction.c         |  6 ++--
 sound/soc/xtensa/xtfpga-i2s.c                      |  6 ++--
 sound/usb/bcd2000/bcd2000.c                        |  4 +--
 tools/arch/x86/include/asm/atomic.h                |  2 +-
 tools/include/asm-generic/atomic-gcc.h             |  2 +-
 tools/perf/util/auxtrace.h                         |  4 +--
 tools/perf/util/session.h                          |  2 +-
 virt/kvm/kvm_main.c                                |  2 +-
 180 files changed, 383 insertions(+), 385 deletions(-)

(limited to 'kernel')

diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c
index f46267153ec2..94cabe73664b 100644
--- a/arch/arc/kernel/smp.c
+++ b/arch/arc/kernel/smp.c
@@ -245,7 +245,7 @@ static void ipi_send_msg_one(int cpu, enum ipi_msg_type msg)
 	 * and read back old value
 	 */
 	do {
-		new = old = ACCESS_ONCE(*ipi_data_ptr);
+		new = old = READ_ONCE(*ipi_data_ptr);
 		new |= 1U << msg;
 	} while (cmpxchg(ipi_data_ptr, old, new) != old);
 
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index daa87212c9a1..77f50ae0aeb4 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -71,7 +71,7 @@ static inline void arch_spin_lock(arch_spinlock_t *lock)
 
 	while (lockval.tickets.next != lockval.tickets.owner) {
 		wfe();
-		lockval.tickets.owner = ACCESS_ONCE(lock->tickets.owner);
+		lockval.tickets.owner = READ_ONCE(lock->tickets.owner);
 	}
 
 	smp_mb();
diff --git a/arch/arm/mach-tegra/cpuidle-tegra20.c b/arch/arm/mach-tegra/cpuidle-tegra20.c
index 76e4c83cd5c8..3f24addd7972 100644
--- a/arch/arm/mach-tegra/cpuidle-tegra20.c
+++ b/arch/arm/mach-tegra/cpuidle-tegra20.c
@@ -179,7 +179,7 @@ static int tegra20_idle_lp2_coupled(struct cpuidle_device *dev,
 	bool entered_lp2 = false;
 
 	if (tegra_pending_sgi())
-		ACCESS_ONCE(abort_flag) = true;
+		WRITE_ONCE(abort_flag, true);
 
 	cpuidle_coupled_parallel_barrier(dev, &abort_barrier);
 
diff --git a/arch/arm/vdso/vgettimeofday.c b/arch/arm/vdso/vgettimeofday.c
index 79214d5ff097..a9dd619c6c29 100644
--- a/arch/arm/vdso/vgettimeofday.c
+++ b/arch/arm/vdso/vgettimeofday.c
@@ -35,7 +35,7 @@ static notrace u32 __vdso_read_begin(const struct vdso_data *vdata)
 {
 	u32 seq;
 repeat:
-	seq = ACCESS_ONCE(vdata->seq_count);
+	seq = READ_ONCE(vdata->seq_count);
 	if (seq & 1) {
 		cpu_relax();
 		goto repeat;
diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
index 35b31884863b..e98775be112d 100644
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -61,7 +61,7 @@ static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
 
 static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
 {
-	int tmp = ACCESS_ONCE(lock->lock);
+	int tmp = READ_ONCE(lock->lock);
 
 	if (!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK))
 		return ia64_cmpxchg(acq, &lock->lock, tmp, tmp + 1, sizeof (tmp)) == tmp;
@@ -73,19 +73,19 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
 	unsigned short	*p = (unsigned short *)&lock->lock + 1, tmp;
 
 	asm volatile ("ld2.bias %0=[%1]" : "=r"(tmp) : "r"(p));
-	ACCESS_ONCE(*p) = (tmp + 2) & ~1;
+	WRITE_ONCE(*p, (tmp + 2) & ~1);
 }
 
 static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
 {
-	long tmp = ACCESS_ONCE(lock->lock);
+	long tmp = READ_ONCE(lock->lock);
 
 	return !!(((tmp >> TICKET_SHIFT) ^ tmp) & TICKET_MASK);
 }
 
 static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
 {
-	long tmp = ACCESS_ONCE(lock->lock);
+	long tmp = READ_ONCE(lock->lock);
 
 	return ((tmp - (tmp >> TICKET_SHIFT)) & TICKET_MASK) > 1;
 }
diff --git a/arch/mips/include/asm/vdso.h b/arch/mips/include/asm/vdso.h
index b7cd6cf77b83..91bf0c2c265c 100644
--- a/arch/mips/include/asm/vdso.h
+++ b/arch/mips/include/asm/vdso.h
@@ -99,7 +99,7 @@ static inline u32 vdso_data_read_begin(const union mips_vdso_data *data)
 	u32 seq;
 
 	while (true) {
-		seq = ACCESS_ONCE(data->seq_count);
+		seq = READ_ONCE(data->seq_count);
 		if (likely(!(seq & 1))) {
 			/* Paired with smp_wmb() in vdso_data_write_*(). */
 			smp_rmb();
diff --git a/arch/mips/kernel/pm-cps.c b/arch/mips/kernel/pm-cps.c
index 4655017f2377..1d2996cd58da 100644
--- a/arch/mips/kernel/pm-cps.c
+++ b/arch/mips/kernel/pm-cps.c
@@ -166,7 +166,7 @@ int cps_pm_enter_state(enum cps_pm_state state)
 	nc_core_ready_count = nc_addr;
 
 	/* Ensure ready_count is zero-initialised before the assembly runs */
-	ACCESS_ONCE(*nc_core_ready_count) = 0;
+	WRITE_ONCE(*nc_core_ready_count, 0);
 	coupled_barrier(&per_cpu(pm_barrier, core), online);
 
 	/* Run the generated entry code */
diff --git a/arch/mn10300/kernel/mn10300-serial.c b/arch/mn10300/kernel/mn10300-serial.c
index 7ecf69879e2d..d7ef1232a82a 100644
--- a/arch/mn10300/kernel/mn10300-serial.c
+++ b/arch/mn10300/kernel/mn10300-serial.c
@@ -543,7 +543,7 @@ static void mn10300_serial_receive_interrupt(struct mn10300_serial_port *port)
 
 try_again:
 	/* pull chars out of the hat */
-	ix = ACCESS_ONCE(port->rx_outp);
+	ix = READ_ONCE(port->rx_outp);
 	if (CIRC_CNT(port->rx_inp, ix, MNSC_BUFFER_SIZE) == 0) {
 		if (push && !tport->low_latency)
 			tty_flip_buffer_push(tport);
@@ -1724,7 +1724,7 @@ static int mn10300_serial_poll_get_char(struct uart_port *_port)
 	if (mn10300_serial_int_tbl[port->rx_irq].port != NULL) {
 		do {
 			/* pull chars out of the hat */
-			ix = ACCESS_ONCE(port->rx_outp);
+			ix = READ_ONCE(port->rx_outp);
 			if (CIRC_CNT(port->rx_inp, ix, MNSC_BUFFER_SIZE) == 0)
 				return NO_POLL_CHAR;
 
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 17b98a87e5e2..c57d4e8307f2 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -260,7 +260,7 @@ atomic64_set(atomic64_t *v, s64 i)
 static __inline__ s64
 atomic64_read(const atomic64_t *v)
 {
-	return ACCESS_ONCE((v)->counter);
+	return READ_ONCE((v)->counter);
 }
 
 #define atomic64_inc(v)		(atomic64_add(   1,(v)))
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index 7a9cde0cfbd1..acd3206dfae3 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -43,7 +43,7 @@ ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
 	if (!opal_memcons)
 		return -ENODEV;
 
-	out_pos = be32_to_cpu(ACCESS_ONCE(opal_memcons->out_pos));
+	out_pos = be32_to_cpu(READ_ONCE(opal_memcons->out_pos));
 
 	/* Now we've read out_pos, put a barrier in before reading the new
 	 * data it points to in conbuf. */
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 9fa855f91e55..66f4160010ef 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -117,14 +117,14 @@ extern int _raw_write_trylock_retry(arch_rwlock_t *lp);
 
 static inline int arch_read_trylock_once(arch_rwlock_t *rw)
 {
-	int old = ACCESS_ONCE(rw->lock);
+	int old = READ_ONCE(rw->lock);
 	return likely(old >= 0 &&
 		      __atomic_cmpxchg_bool(&rw->lock, old, old + 1));
 }
 
 static inline int arch_write_trylock_once(arch_rwlock_t *rw)
 {
-	int old = ACCESS_ONCE(rw->lock);
+	int old = READ_ONCE(rw->lock);
 	return likely(old == 0 &&
 		      __atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000));
 }
@@ -211,7 +211,7 @@ static inline void arch_read_unlock(arch_rwlock_t *rw)
 	int old;
 
 	do {
-		old = ACCESS_ONCE(rw->lock);
+		old = READ_ONCE(rw->lock);
 	} while (!__atomic_cmpxchg_bool(&rw->lock, old, old - 1));
 }
 
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index b12663d653d8..34e30b9ea234 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -162,8 +162,8 @@ void _raw_read_lock_wait(arch_rwlock_t *rw)
 				smp_yield_cpu(~owner);
 			count = spin_retry;
 		}
-		old = ACCESS_ONCE(rw->lock);
-		owner = ACCESS_ONCE(rw->owner);
+		old = READ_ONCE(rw->lock);
+		owner = READ_ONCE(rw->owner);
 		if (old < 0)
 			continue;
 		if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1))
@@ -178,7 +178,7 @@ int _raw_read_trylock_retry(arch_rwlock_t *rw)
 	int old;
 
 	while (count-- > 0) {
-		old = ACCESS_ONCE(rw->lock);
+		old = READ_ONCE(rw->lock);
 		if (old < 0)
 			continue;
 		if (__atomic_cmpxchg_bool(&rw->lock, old, old + 1))
@@ -202,8 +202,8 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, int prev)
 				smp_yield_cpu(~owner);
 			count = spin_retry;
 		}
-		old = ACCESS_ONCE(rw->lock);
-		owner = ACCESS_ONCE(rw->owner);
+		old = READ_ONCE(rw->lock);
+		owner = READ_ONCE(rw->owner);
 		smp_mb();
 		if (old >= 0) {
 			prev = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR);
@@ -230,8 +230,8 @@ void _raw_write_lock_wait(arch_rwlock_t *rw)
 				smp_yield_cpu(~owner);
 			count = spin_retry;
 		}
-		old = ACCESS_ONCE(rw->lock);
-		owner = ACCESS_ONCE(rw->owner);
+		old = READ_ONCE(rw->lock);
+		owner = READ_ONCE(rw->owner);
 		if (old >= 0 &&
 		    __atomic_cmpxchg_bool(&rw->lock, old, old | 0x80000000))
 			prev = old;
@@ -251,7 +251,7 @@ int _raw_write_trylock_retry(arch_rwlock_t *rw)
 	int old;
 
 	while (count-- > 0) {
-		old = ACCESS_ONCE(rw->lock);
+		old = READ_ONCE(rw->lock);
 		if (old)
 			continue;
 		if (__atomic_cmpxchg_bool(&rw->lock, 0, 0x80000000))
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index 7643e979e333..e2f398e9456c 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -31,7 +31,7 @@ void atomic_set(atomic_t *, int);
 
 #define atomic_set_release(v, i)	atomic_set((v), (i))
 
-#define atomic_read(v)          ACCESS_ONCE((v)->counter)
+#define atomic_read(v)          READ_ONCE((v)->counter)
 
 #define atomic_add(i, v)	((void)atomic_add_return( (int)(i), (v)))
 #define atomic_sub(i, v)	((void)atomic_add_return(-(int)(i), (v)))
diff --git a/arch/tile/gxio/dma_queue.c b/arch/tile/gxio/dma_queue.c
index baa60357f8ba..b7ba577d82ca 100644
--- a/arch/tile/gxio/dma_queue.c
+++ b/arch/tile/gxio/dma_queue.c
@@ -163,14 +163,14 @@ int __gxio_dma_queue_is_complete(__gxio_dma_queue_t *dma_queue,
 				 int64_t completion_slot, int update)
 {
 	if (update) {
-		if (ACCESS_ONCE(dma_queue->hw_complete_count) >
+		if (READ_ONCE(dma_queue->hw_complete_count) >
 		    completion_slot)
 			return 1;
 
 		__gxio_dma_queue_update_credits(dma_queue);
 	}
 
-	return ACCESS_ONCE(dma_queue->hw_complete_count) > completion_slot;
+	return READ_ONCE(dma_queue->hw_complete_count) > completion_slot;
 }
 
 EXPORT_SYMBOL_GPL(__gxio_dma_queue_is_complete);
diff --git a/arch/tile/include/gxio/dma_queue.h b/arch/tile/include/gxio/dma_queue.h
index b9e45e37649e..c8fd47edba30 100644
--- a/arch/tile/include/gxio/dma_queue.h
+++ b/arch/tile/include/gxio/dma_queue.h
@@ -121,7 +121,7 @@ static inline int64_t __gxio_dma_queue_reserve(__gxio_dma_queue_t *dma_queue,
 		 * if the result is LESS than "hw_complete_count".
 		 */
 		uint64_t complete;
-		complete = ACCESS_ONCE(dma_queue->hw_complete_count);
+		complete = READ_ONCE(dma_queue->hw_complete_count);
 		slot |= (complete & 0xffffffffff000000);
 		if (slot < complete)
 			slot += 0x1000000;
diff --git a/arch/tile/kernel/ptrace.c b/arch/tile/kernel/ptrace.c
index e1a078e6828e..d516d61751c2 100644
--- a/arch/tile/kernel/ptrace.c
+++ b/arch/tile/kernel/ptrace.c
@@ -255,7 +255,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
 int do_syscall_trace_enter(struct pt_regs *regs)
 {
-	u32 work = ACCESS_ONCE(current_thread_info()->flags);
+	u32 work = READ_ONCE(current_thread_info()->flags);
 
 	if ((work & _TIF_SYSCALL_TRACE) &&
 	    tracehook_report_syscall_entry(regs)) {
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 03505ffbe1b6..eaa0ba66cf96 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -75,7 +75,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
 	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
 		BUG_ON(regs != task_pt_regs(current));
 
-	work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
+	work = READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
 
 	if (unlikely(work & _TIF_SYSCALL_EMU))
 		emulated = true;
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index fa8dbfcf7ed3..11b13c4b43d5 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -318,7 +318,7 @@ int gettimeofday(struct timeval *, struct timezone *)
 notrace time_t __vdso_time(time_t *t)
 {
 	/* This is atomic on x86 so we don't need any locks. */
-	time_t result = ACCESS_ONCE(gtod->wall_time_sec);
+	time_t result = READ_ONCE(gtod->wall_time_sec);
 
 	if (t)
 		*t = result;
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 589af1eec7c1..140d33288e78 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2118,7 +2118,7 @@ static int x86_pmu_event_init(struct perf_event *event)
 			event->destroy(event);
 	}
 
-	if (ACCESS_ONCE(x86_pmu.attr_rdpmc))
+	if (READ_ONCE(x86_pmu.attr_rdpmc))
 		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
 
 	return err;
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 022e59714562..53dd162576a8 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -48,7 +48,7 @@ static inline unsigned gtod_read_begin(const struct vsyscall_gtod_data *s)
 	unsigned ret;
 
 repeat:
-	ret = ACCESS_ONCE(s->seq);
+	ret = READ_ONCE(s->seq);
 	if (unlikely(ret & 1)) {
 		cpu_relax();
 		goto repeat;
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 9c4e7ba6870c..7d7715dde901 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -155,14 +155,14 @@ void init_espfix_ap(int cpu)
 	page = cpu/ESPFIX_STACKS_PER_PAGE;
 
 	/* Did another CPU already set this up? */
-	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	stack_page = READ_ONCE(espfix_pages[page]);
 	if (likely(stack_page))
 		goto done;
 
 	mutex_lock(&espfix_init_mutex);
 
 	/* Did we race on the lock? */
-	stack_page = ACCESS_ONCE(espfix_pages[page]);
+	stack_page = READ_ONCE(espfix_pages[page]);
 	if (stack_page)
 		goto unlock_done;
 
@@ -200,7 +200,7 @@ void init_espfix_ap(int cpu)
 		set_pte(&pte_p[n*PTE_STRIDE], pte);
 
 	/* Job is done for this CPU and any CPU which shares this page */
-	ACCESS_ONCE(espfix_pages[page]) = stack_page;
+	WRITE_ONCE(espfix_pages[page], stack_page);
 
 unlock_done:
 	mutex_unlock(&espfix_init_mutex);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 35aafc95e4b8..18bc9b51ac9b 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -105,7 +105,7 @@ static void nmi_max_handler(struct irq_work *w)
 {
 	struct nmiaction *a = container_of(w, struct nmiaction, irq_work);
 	int remainder_ns, decimal_msecs;
-	u64 whole_msecs = ACCESS_ONCE(a->max_duration);
+	u64 whole_msecs = READ_ONCE(a->max_duration);
 
 	remainder_ns = do_div(whole_msecs, (1000 * 1000));
 	decimal_msecs = remainder_ns / 1000;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7a69cf053711..a119b361b8b7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -443,7 +443,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 
 static u64 __get_spte_lockless(u64 *sptep)
 {
-	return ACCESS_ONCE(*sptep);
+	return READ_ONCE(*sptep);
 }
 #else
 union split_spte {
@@ -4819,7 +4819,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	 * If we don't have indirect shadow pages, it means no page is
 	 * write-protected, so we can exit simply.
 	 */
-	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
 		return;
 
 	remote_flush = local_flush = false;
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index ea67dc876316..01c1371f39f8 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -157,7 +157,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
 		return false;
 
 	index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL);
-	return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
+	return !!READ_ONCE(slot->arch.gfn_track[mode][index]);
 }
 
 void kvm_page_track_cleanup(struct kvm *kvm)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 6083ba462f35..13b4f19b9131 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -547,7 +547,7 @@ int xen_alloc_p2m_entry(unsigned long pfn)
 	if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
 		topidx = p2m_top_index(pfn);
 		top_mfn_p = &p2m_top_mfn[topidx];
-		mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
+		mid_mfn = READ_ONCE(p2m_top_mfn_p[topidx]);
 
 		BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
 
diff --git a/arch/xtensa/platforms/xtfpga/lcd.c b/arch/xtensa/platforms/xtfpga/lcd.c
index 4dc0c1b43f4b..2f7eb66c23ec 100644
--- a/arch/xtensa/platforms/xtfpga/lcd.c
+++ b/arch/xtensa/platforms/xtfpga/lcd.c
@@ -34,23 +34,23 @@
 static void lcd_put_byte(u8 *addr, u8 data)
 {
 #ifdef CONFIG_XTFPGA_LCD_8BIT_ACCESS
-	ACCESS_ONCE(*addr) = data;
+	WRITE_ONCE(*addr, data);
 #else
-	ACCESS_ONCE(*addr) = data & 0xf0;
-	ACCESS_ONCE(*addr) = (data << 4) & 0xf0;
+	WRITE_ONCE(*addr, data & 0xf0);
+	WRITE_ONCE(*addr, (data << 4) & 0xf0);
 #endif
 }
 
 static int __init lcd_init(void)
 {
-	ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT;
+	WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT);
 	mdelay(5);
-	ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT;
+	WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT);
 	udelay(200);
-	ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE8BIT;
+	WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE8BIT);
 	udelay(50);
 #ifndef CONFIG_XTFPGA_LCD_8BIT_ACCESS
-	ACCESS_ONCE(*LCD_INSTR_ADDR) = LCD_DISPLAY_MODE4BIT;
+	WRITE_ONCE(*LCD_INSTR_ADDR, LCD_DISPLAY_MODE4BIT);
 	udelay(50);
 	lcd_put_byte(LCD_INSTR_ADDR, LCD_DISPLAY_MODE4BIT);
 	udelay(50);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 6a9a0f03a67b..d822530e6aea 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -261,7 +261,7 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
 
 static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
 {
-	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+	u64 now, issue = READ_ONCE(rwb->sync_issue);
 
 	if (!issue || !rwb->sync_cookie)
 		return 0;
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 12ebd055724c..4b8ba2a75a4d 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -668,7 +668,7 @@ const char *dev_driver_string(const struct device *dev)
 	 * so be careful about accessing it.  dev->bus and dev->class should
 	 * never change once they are set, so they don't need special care.
 	 */
-	drv = ACCESS_ONCE(dev->driver);
+	drv = READ_ONCE(dev->driver);
 	return drv ? drv->name :
 			(dev->bus ? dev->bus->name :
 			(dev->class ? dev->class->name : ""));
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 7bcf80fa9ada..41d7c2b99f69 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -134,11 +134,11 @@ unsigned long pm_runtime_autosuspend_expiration(struct device *dev)
 	if (!dev->power.use_autosuspend)
 		goto out;
 
-	autosuspend_delay = ACCESS_ONCE(dev->power.autosuspend_delay);
+	autosuspend_delay = READ_ONCE(dev->power.autosuspend_delay);
 	if (autosuspend_delay < 0)
 		goto out;
 
-	last_busy = ACCESS_ONCE(dev->power.last_busy);
+	last_busy = READ_ONCE(dev->power.last_busy);
 	elapsed = jiffies - last_busy;
 	if (elapsed < 0)
 		goto out;	/* jiffies has wrapped around. */
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 8ad92707e45f..6c7ccac2679e 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -641,7 +641,7 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits)
 		return;
 
 retry:
-	entropy_count = orig = ACCESS_ONCE(r->entropy_count);
+	entropy_count = orig = READ_ONCE(r->entropy_count);
 	if (nfrac < 0) {
 		/* Debit */
 		entropy_count += nfrac;
@@ -1265,7 +1265,7 @@ static size_t account(struct entropy_store *r, size_t nbytes, int min,
 
 	/* Can we pull enough? */
 retry:
-	entropy_count = orig = ACCESS_ONCE(r->entropy_count);
+	entropy_count = orig = READ_ONCE(r->entropy_count);
 	ibytes = nbytes;
 	/* never pull more than available */
 	have_bytes = entropy_count >> (ENTROPY_SHIFT + 3);
diff --git a/drivers/clocksource/bcm2835_timer.c b/drivers/clocksource/bcm2835_timer.c
index 39e489a96ad7..60da2537bef9 100644
--- a/drivers/clocksource/bcm2835_timer.c
+++ b/drivers/clocksource/bcm2835_timer.c
@@ -71,7 +71,7 @@ static irqreturn_t bcm2835_time_interrupt(int irq, void *dev_id)
 	if (readl_relaxed(timer->control) & timer->match_mask) {
 		writel_relaxed(timer->match_mask, timer->control);
 
-		event_handler = ACCESS_ONCE(timer->evt.event_handler);
+		event_handler = READ_ONCE(timer->evt.event_handler);
 		if (event_handler)
 			event_handler(&timer->evt);
 		return IRQ_HANDLED;
diff --git a/drivers/crypto/caam/jr.c b/drivers/crypto/caam/jr.c
index d258953ff488..f4f258075b89 100644
--- a/drivers/crypto/caam/jr.c
+++ b/drivers/crypto/caam/jr.c
@@ -172,7 +172,7 @@ static void caam_jr_dequeue(unsigned long devarg)
 
 	while (rd_reg32(&jrp->rregs->outring_used)) {
 
-		head = ACCESS_ONCE(jrp->head);
+		head = READ_ONCE(jrp->head);
 
 		spin_lock(&jrp->outlock);
 
@@ -341,7 +341,7 @@ int caam_jr_enqueue(struct device *dev, u32 *desc,
 	spin_lock_bh(&jrp->inplock);
 
 	head = jrp->head;
-	tail = ACCESS_ONCE(jrp->tail);
+	tail = READ_ONCE(jrp->tail);
 
 	if (!rd_reg32(&jrp->rregs->inpring_avail) ||
 	    CIRC_SPACE(head, tail, JOBR_DEPTH) <= 0) {
diff --git a/drivers/crypto/nx/nx-842-powernv.c b/drivers/crypto/nx/nx-842-powernv.c
index 874ddf5e9087..0f20f5ec9617 100644
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -193,7 +193,7 @@ static int wait_for_csb(struct nx842_workmem *wmem,
 	ktime_t start = wmem->start, now = ktime_get();
 	ktime_t timeout = ktime_add_ms(start, CSB_WAIT_MAX);
 
-	while (!(ACCESS_ONCE(csb->flags) & CSB_V)) {
+	while (!(READ_ONCE(csb->flags) & CSB_V)) {
 		cpu_relax();
 		now = ktime_get();
 		if (ktime_after(now, timeout))
diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c
index 8bf89267dc25..ccf52368a073 100644
--- a/drivers/firewire/ohci.c
+++ b/drivers/firewire/ohci.c
@@ -734,7 +734,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx,
 	__le16 res_count, next_res_count;
 
 	i = ar_first_buffer_index(ctx);
-	res_count = ACCESS_ONCE(ctx->descriptors[i].res_count);
+	res_count = READ_ONCE(ctx->descriptors[i].res_count);
 
 	/* A buffer that is not yet completely filled must be the last one. */
 	while (i != last && res_count == 0) {
@@ -742,8 +742,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx,
 		/* Peek at the next descriptor. */
 		next_i = ar_next_buffer_index(i);
 		rmb(); /* read descriptors in order */
-		next_res_count = ACCESS_ONCE(
-				ctx->descriptors[next_i].res_count);
+		next_res_count = READ_ONCE(ctx->descriptors[next_i].res_count);
 		/*
 		 * If the next descriptor is still empty, we must stop at this
 		 * descriptor.
@@ -759,8 +758,7 @@ static unsigned int ar_search_last_active_buffer(struct ar_context *ctx,
 			if (MAX_AR_PACKET_SIZE > PAGE_SIZE && i != last) {
 				next_i = ar_next_buffer_index(next_i);
 				rmb();
-				next_res_count = ACCESS_ONCE(
-					ctx->descriptors[next_i].res_count);
+				next_res_count = READ_ONCE(ctx->descriptors[next_i].res_count);
 				if (next_res_count != cpu_to_le16(PAGE_SIZE))
 					goto next_buffer_is_active;
 			}
@@ -2812,7 +2810,7 @@ static int handle_ir_buffer_fill(struct context *context,
 	u32 buffer_dma;
 
 	req_count = le16_to_cpu(last->req_count);
-	res_count = le16_to_cpu(ACCESS_ONCE(last->res_count));
+	res_count = le16_to_cpu(READ_ONCE(last->res_count));
 	completed = req_count - res_count;
 	buffer_dma = le32_to_cpu(last->data_address);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 333bad749067..303b5e099a98 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -260,7 +260,7 @@ static void amdgpu_fence_fallback(unsigned long arg)
  */
 int amdgpu_fence_wait_empty(struct amdgpu_ring *ring)
 {
-	uint64_t seq = ACCESS_ONCE(ring->fence_drv.sync_seq);
+	uint64_t seq = READ_ONCE(ring->fence_drv.sync_seq);
 	struct dma_fence *fence, **ptr;
 	int r;
 
@@ -300,7 +300,7 @@ unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring)
 	amdgpu_fence_process(ring);
 	emitted = 0x100000000ull;
 	emitted -= atomic_read(&ring->fence_drv.last_seq);
-	emitted += ACCESS_ONCE(ring->fence_drv.sync_seq);
+	emitted += READ_ONCE(ring->fence_drv.sync_seq);
 	return lower_32_bits(emitted);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 7171968f261e..6149a47fe63d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -788,11 +788,11 @@ static int amdgpu_debugfs_gem_bo_info(int id, void *ptr, void *data)
 	seq_printf(m, "\t0x%08x: %12ld byte %s",
 		   id, amdgpu_bo_size(bo), placement);
 
-	offset = ACCESS_ONCE(bo->tbo.mem.start);
+	offset = READ_ONCE(bo->tbo.mem.start);
 	if (offset != AMDGPU_BO_INVALID_OFFSET)
 		seq_printf(m, " @ 0x%010Lx", offset);
 
-	pin_count = ACCESS_ONCE(bo->pin_count);
+	pin_count = READ_ONCE(bo->pin_count);
 	if (pin_count)
 		seq_printf(m, " pin count %d", pin_count);
 	seq_printf(m, "\n");
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 38cea6fb25a8..a25f6c72f219 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -187,7 +187,7 @@ static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)
 	if (kfifo_is_empty(&entity->job_queue))
 		return false;
 
-	if (ACCESS_ONCE(entity->dependency))
+	if (READ_ONCE(entity->dependency))
 		return false;
 
 	return true;
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 3386452bd2f0..cf3deb283da5 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -451,7 +451,7 @@ int radeon_gem_busy_ioctl(struct drm_device *dev, void *data,
 	else
 		r = 0;
 
-	cur_placement = ACCESS_ONCE(robj->tbo.mem.mem_type);
+	cur_placement = READ_ONCE(robj->tbo.mem.mem_type);
 	args->domain = radeon_mem_type_to_domain(cur_placement);
 	drm_gem_object_put_unlocked(gobj);
 	return r;
@@ -481,7 +481,7 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data,
 		r = ret;
 
 	/* Flush HDP cache via MMIO if necessary */
-	cur_placement = ACCESS_ONCE(robj->tbo.mem.mem_type);
+	cur_placement = READ_ONCE(robj->tbo.mem.mem_type);
 	if (rdev->asic->mmio_hdp_flush &&
 	    radeon_mem_type_to_domain(cur_placement) == RADEON_GEM_DOMAIN_VRAM)
 		robj->rdev->asic->mmio_hdp_flush(rdev);
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
index a552e4ea5440..6ac094ee8983 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_surface.c
@@ -904,7 +904,7 @@ vmw_surface_handle_reference(struct vmw_private *dev_priv,
 		if (unlikely(drm_is_render_client(file_priv)))
 			require_exist = true;
 
-		if (ACCESS_ONCE(vmw_fpriv(file_priv)->locked_master)) {
+		if (READ_ONCE(vmw_fpriv(file_priv)->locked_master)) {
 			DRM_ERROR("Locked master refused legacy "
 				  "surface reference.\n");
 			return -EACCES;
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index d9a1e9893136..97bea2e1aa6a 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -380,7 +380,7 @@ static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
 		if (sc->flags & SCF_FROZEN) {
 			wait_event_interruptible_timeout(
 				dd->event_queue,
-				!(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
+				!(READ_ONCE(dd->flags) & HFI1_FROZEN),
 				msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
 			if (dd->flags & HFI1_FROZEN)
 				return -ENOLCK;
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 7108a4b5e94c..75e740780285 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -1423,14 +1423,14 @@ retry:
 			goto done;
 		}
 		/* copy from receiver cache line and recalculate */
-		sc->alloc_free = ACCESS_ONCE(sc->free);
+		sc->alloc_free = READ_ONCE(sc->free);
 		avail =
 			(unsigned long)sc->credits -
 			(sc->fill - sc->alloc_free);
 		if (blocks > avail) {
 			/* still no room, actively update */
 			sc_release_update(sc);
-			sc->alloc_free = ACCESS_ONCE(sc->free);
+			sc->alloc_free = READ_ONCE(sc->free);
 			trycount++;
 			goto retry;
 		}
@@ -1667,7 +1667,7 @@ void sc_release_update(struct send_context *sc)
 
 	/* call sent buffer callbacks */
 	code = -1;				/* code not yet set */
-	head = ACCESS_ONCE(sc->sr_head);	/* snapshot the head */
+	head = READ_ONCE(sc->sr_head);	/* snapshot the head */
 	tail = sc->sr_tail;
 	while (head != tail) {
 		pbuf = &sc->sr[tail].pbuf;
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index b3291f0fde9a..a7fc664f0d4e 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -363,7 +363,7 @@ static void ruc_loopback(struct rvt_qp *sqp)
 
 again:
 	smp_read_barrier_depends(); /* see post_one_send() */
-	if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
+	if (sqp->s_last == READ_ONCE(sqp->s_head))
 		goto clr_busy;
 	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
 
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 6781bcdb10b3..08346d25441c 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -1725,7 +1725,7 @@ retry:
 
 		swhead = sde->descq_head & sde->sdma_mask;
 		/* this code is really bad for cache line trading */
-		swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+		swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
 		cnt = sde->descq_cnt;
 
 		if (swhead < swtail)
@@ -1872,7 +1872,7 @@ retry:
 	if ((status & sde->idle_mask) && !idle_check_done) {
 		u16 swtail;
 
-		swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+		swtail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
 		if (swtail != hwhead) {
 			hwhead = (u16)read_sde_csr(sde, SD(HEAD));
 			idle_check_done = 1;
@@ -2222,7 +2222,7 @@ void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde)
 	u16 len;
 
 	head = sde->descq_head & sde->sdma_mask;
-	tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask;
+	tail = READ_ONCE(sde->descq_tail) & sde->sdma_mask;
 	seq_printf(s, SDE_FMT, sde->this_idx,
 		   sde->cpu,
 		   sdma_state_name(sde->state.current_state),
@@ -3305,7 +3305,7 @@ int sdma_ahg_alloc(struct sdma_engine *sde)
 		return -EINVAL;
 	}
 	while (1) {
-		nr = ffz(ACCESS_ONCE(sde->ahg_bits));
+		nr = ffz(READ_ONCE(sde->ahg_bits));
 		if (nr > 31) {
 			trace_hfi1_ahg_allocate(sde, -ENOSPC);
 			return -ENOSPC;
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index 107011d8613b..374c59784950 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -445,7 +445,7 @@ static inline u16 sdma_descq_freecnt(struct sdma_engine *sde)
 {
 	return sde->descq_cnt -
 		(sde->descq_tail -
-		 ACCESS_ONCE(sde->descq_head)) - 1;
+		 READ_ONCE(sde->descq_head)) - 1;
 }
 
 static inline u16 sdma_descq_inprocess(struct sdma_engine *sde)
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
index 0b646173ca22..9a31c585427f 100644
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -80,7 +80,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			goto bail;
 		/* We are in the error state, flush the work request. */
 		smp_read_barrier_depends(); /* see post_one_send() */
-		if (qp->s_last == ACCESS_ONCE(qp->s_head))
+		if (qp->s_last == READ_ONCE(qp->s_head))
 			goto bail;
 		/* If DMAs are in progress, we can't flush immediately. */
 		if (iowait_sdma_pending(&priv->s_iowait)) {
@@ -121,7 +121,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			goto bail;
 		/* Check if send work queue is empty. */
 		smp_read_barrier_depends(); /* see post_one_send() */
-		if (qp->s_cur == ACCESS_ONCE(qp->s_head)) {
+		if (qp->s_cur == READ_ONCE(qp->s_head)) {
 			clear_ahg(qp);
 			goto bail;
 		}
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index 2ba74fdd6f15..7fec6b984e3e 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -487,7 +487,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 			goto bail;
 		/* We are in the error state, flush the work request. */
 		smp_read_barrier_depends(); /* see post_one_send */
-		if (qp->s_last == ACCESS_ONCE(qp->s_head))
+		if (qp->s_last == READ_ONCE(qp->s_head))
 			goto bail;
 		/* If DMAs are in progress, we can't flush immediately. */
 		if (iowait_sdma_pending(&priv->s_iowait)) {
@@ -501,7 +501,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 
 	/* see post_one_send() */
 	smp_read_barrier_depends();
-	if (qp->s_cur == ACCESS_ONCE(qp->s_head))
+	if (qp->s_cur == READ_ONCE(qp->s_head))
 		goto bail;
 
 	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index c0c0e0445cbf..8ec6e8a8d6f7 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -276,7 +276,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
 		/* Wait until all requests have been freed. */
 		wait_event_interruptible(
 			pq->wait,
-			(ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
+			(READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
 		kfree(pq->reqs);
 		kfree(pq->req_in_use);
 		kmem_cache_destroy(pq->txreq_cache);
@@ -591,7 +591,7 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd,
 			if (ret != -EBUSY) {
 				req->status = ret;
 				WRITE_ONCE(req->has_error, 1);
-				if (ACCESS_ONCE(req->seqcomp) ==
+				if (READ_ONCE(req->seqcomp) ==
 				    req->seqsubmitted - 1)
 					goto free_req;
 				return ret;
@@ -825,7 +825,7 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
 		 */
 		if (req->data_len) {
 			iovec = &req->iovs[req->iov_idx];
-			if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
+			if (READ_ONCE(iovec->offset) == iovec->iov.iov_len) {
 				if (++req->iov_idx == req->data_iovs) {
 					ret = -EFAULT;
 					goto free_txreq;
@@ -1390,7 +1390,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
 	} else {
 		if (status != SDMA_TXREQ_S_OK)
 			req->status = status;
-		if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
+		if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) &&
 		    (READ_ONCE(req->done) ||
 		     READ_ONCE(req->has_error))) {
 			user_sdma_free_request(req, false);
diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c
index 53efbb0b40c4..9a37e844d4c8 100644
--- a/drivers/infiniband/hw/qib/qib_ruc.c
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -368,7 +368,7 @@ static void qib_ruc_loopback(struct rvt_qp *sqp)
 
 again:
 	smp_read_barrier_depends(); /* see post_one_send() */
-	if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
+	if (sqp->s_last == READ_ONCE(sqp->s_head))
 		goto clr_busy;
 	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
 
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c
index 498e2202e72c..bddcc37ace44 100644
--- a/drivers/infiniband/hw/qib/qib_uc.c
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -61,7 +61,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags)
 			goto bail;
 		/* We are in the error state, flush the work request. */
 		smp_read_barrier_depends(); /* see post_one_send() */
-		if (qp->s_last == ACCESS_ONCE(qp->s_head))
+		if (qp->s_last == READ_ONCE(qp->s_head))
 			goto bail;
 		/* If DMAs are in progress, we can't flush immediately. */
 		if (atomic_read(&priv->s_dma_busy)) {
@@ -91,7 +91,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags)
 			goto bail;
 		/* Check if send work queue is empty. */
 		smp_read_barrier_depends(); /* see post_one_send() */
-		if (qp->s_cur == ACCESS_ONCE(qp->s_head))
+		if (qp->s_cur == READ_ONCE(qp->s_head))
 			goto bail;
 		/*
 		 * Start a new request.
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index be4907453ac4..15962ed193ce 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -253,7 +253,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 			goto bail;
 		/* We are in the error state, flush the work request. */
 		smp_read_barrier_depends(); /* see post_one_send */
-		if (qp->s_last == ACCESS_ONCE(qp->s_head))
+		if (qp->s_last == READ_ONCE(qp->s_head))
 			goto bail;
 		/* If DMAs are in progress, we can't flush immediately. */
 		if (atomic_read(&priv->s_dma_busy)) {
@@ -267,7 +267,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags)
 
 	/* see post_one_send() */
 	smp_read_barrier_depends();
-	if (qp->s_cur == ACCESS_ONCE(qp->s_head))
+	if (qp->s_cur == READ_ONCE(qp->s_head))
 		goto bail;
 
 	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 22df09ae809e..b670cb9d2006 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1073,7 +1073,7 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
 	rdi->driver_f.notify_error_qp(qp);
 
 	/* Schedule the sending tasklet to drain the send work queue. */
-	if (ACCESS_ONCE(qp->s_last) != qp->s_head)
+	if (READ_ONCE(qp->s_last) != qp->s_head)
 		rdi->driver_f.schedule_send(qp);
 
 	rvt_clear_mr_refs(qp, 0);
@@ -1686,7 +1686,7 @@ static inline int rvt_qp_is_avail(
 	if (likely(qp->s_avail))
 		return 0;
 	smp_read_barrier_depends(); /* see rc.c */
-	slast = ACCESS_ONCE(qp->s_last);
+	slast = READ_ONCE(qp->s_last);
 	if (qp->s_head >= slast)
 		avail = qp->s_size - (qp->s_head - slast);
 	else
@@ -1917,7 +1917,7 @@ int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	 * ahead and kick the send engine into gear. Otherwise we will always
 	 * just schedule the send to happen later.
 	 */
-	call_send = qp->s_head == ACCESS_ONCE(qp->s_last) && !wr->next;
+	call_send = qp->s_head == READ_ONCE(qp->s_last) && !wr->next;
 
 	for (; wr; wr = wr->next) {
 		err = rvt_post_one_wr(qp, wr, &call_send);
diff --git a/drivers/input/misc/regulator-haptic.c b/drivers/input/misc/regulator-haptic.c
index 2e8f801932be..a1db1e5040dc 100644
--- a/drivers/input/misc/regulator-haptic.c
+++ b/drivers/input/misc/regulator-haptic.c
@@ -233,7 +233,7 @@ static int __maybe_unused regulator_haptic_resume(struct device *dev)
 
 	haptic->suspended = false;
 
-	magnitude = ACCESS_ONCE(haptic->magnitude);
+	magnitude = READ_ONCE(haptic->magnitude);
 	if (magnitude)
 		regulator_haptic_set_voltage(haptic, magnitude);
 
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index d216a8f7bc22..33bb074d6941 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -347,7 +347,7 @@ static void __cache_size_refresh(void)
 	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
 	BUG_ON(dm_bufio_client_count < 0);
 
-	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);
+	dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
 
 	/*
 	 * Use default if set to 0 and report the actual cache size used.
@@ -960,7 +960,7 @@ static void __get_memory_limit(struct dm_bufio_client *c,
 {
 	unsigned long buffers;
 
-	if (unlikely(ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
+	if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
 		if (mutex_trylock(&dm_bufio_clients_lock)) {
 			__cache_size_refresh();
 			mutex_unlock(&dm_bufio_clients_lock);
@@ -1600,7 +1600,7 @@ static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
 
 static unsigned long get_retain_buffers(struct dm_bufio_client *c)
 {
-        unsigned long retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+        unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
         return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT);
 }
 
@@ -1647,7 +1647,7 @@ dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
 
-	return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]);
+	return READ_ONCE(c->n_buffers[LIST_CLEAN]) + READ_ONCE(c->n_buffers[LIST_DIRTY]);
 }
 
 /*
@@ -1818,7 +1818,7 @@ EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
 
 static unsigned get_max_age_hz(void)
 {
-	unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
+	unsigned max_age = READ_ONCE(dm_bufio_max_age);
 
 	if (max_age > UINT_MAX / HZ)
 		max_age = UINT_MAX / HZ;
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index cf2c67e35eaf..eb45cc3df31d 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -107,7 +107,7 @@ static void io_job_start(struct dm_kcopyd_throttle *t)
 try_again:
 	spin_lock_irq(&throttle_spinlock);
 
-	throttle = ACCESS_ONCE(t->throttle);
+	throttle = READ_ONCE(t->throttle);
 
 	if (likely(throttle >= 100))
 		goto skip_limit;
@@ -157,7 +157,7 @@ static void io_job_finish(struct dm_kcopyd_throttle *t)
 
 	t->num_io_jobs--;
 
-	if (likely(ACCESS_ONCE(t->throttle) >= 100))
+	if (likely(READ_ONCE(t->throttle) >= 100))
 		goto skip_limit;
 
 	if (!t->num_io_jobs) {
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 6028d8247f58..a1a5eec783cc 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -431,7 +431,7 @@ do_sync_free:
 		synchronize_rcu_expedited();
 		dm_stat_free(&s->rcu_head);
 	} else {
-		ACCESS_ONCE(dm_stat_need_rcu_barrier) = 1;
+		WRITE_ONCE(dm_stat_need_rcu_barrier, 1);
 		call_rcu(&s->rcu_head, dm_stat_free);
 	}
 	return 0;
@@ -639,12 +639,12 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 		 */
 		last = raw_cpu_ptr(stats->last);
 		stats_aux->merged =
-			(bi_sector == (ACCESS_ONCE(last->last_sector) &&
+			(bi_sector == (READ_ONCE(last->last_sector) &&
 				       ((bi_rw == WRITE) ==
-					(ACCESS_ONCE(last->last_rw) == WRITE))
+					(READ_ONCE(last->last_rw) == WRITE))
 				       ));
-		ACCESS_ONCE(last->last_sector) = end_sector;
-		ACCESS_ONCE(last->last_rw) = bi_rw;
+		WRITE_ONCE(last->last_sector, end_sector);
+		WRITE_ONCE(last->last_rw, bi_rw);
 	}
 
 	rcu_read_lock();
@@ -693,22 +693,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 
 	for_each_possible_cpu(cpu) {
 		p = &s->stat_percpu[cpu][x];
-		shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
-		shared->tmp.sectors[WRITE] += ACCESS_ONCE(p->sectors[WRITE]);
-		shared->tmp.ios[READ] += ACCESS_ONCE(p->ios[READ]);
-		shared->tmp.ios[WRITE] += ACCESS_ONCE(p->ios[WRITE]);
-		shared->tmp.merges[READ] += ACCESS_ONCE(p->merges[READ]);
-		shared->tmp.merges[WRITE] += ACCESS_ONCE(p->merges[WRITE]);
-		shared->tmp.ticks[READ] += ACCESS_ONCE(p->ticks[READ]);
-		shared->tmp.ticks[WRITE] += ACCESS_ONCE(p->ticks[WRITE]);
-		shared->tmp.io_ticks[READ] += ACCESS_ONCE(p->io_ticks[READ]);
-		shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
-		shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
-		shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+		shared->tmp.sectors[READ] += READ_ONCE(p->sectors[READ]);
+		shared->tmp.sectors[WRITE] += READ_ONCE(p->sectors[WRITE]);
+		shared->tmp.ios[READ] += READ_ONCE(p->ios[READ]);
+		shared->tmp.ios[WRITE] += READ_ONCE(p->ios[WRITE]);
+		shared->tmp.merges[READ] += READ_ONCE(p->merges[READ]);
+		shared->tmp.merges[WRITE] += READ_ONCE(p->merges[WRITE]);
+		shared->tmp.ticks[READ] += READ_ONCE(p->ticks[READ]);
+		shared->tmp.ticks[WRITE] += READ_ONCE(p->ticks[WRITE]);
+		shared->tmp.io_ticks[READ] += READ_ONCE(p->io_ticks[READ]);
+		shared->tmp.io_ticks[WRITE] += READ_ONCE(p->io_ticks[WRITE]);
+		shared->tmp.io_ticks_total += READ_ONCE(p->io_ticks_total);
+		shared->tmp.time_in_queue += READ_ONCE(p->time_in_queue);
 		if (s->n_histogram_entries) {
 			unsigned i;
 			for (i = 0; i < s->n_histogram_entries + 1; i++)
-				shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+				shared->tmp.histogram[i] += READ_ONCE(p->histogram[i]);
 		}
 	}
 }
diff --git a/drivers/md/dm-switch.c b/drivers/md/dm-switch.c
index 4c8de1ff78ca..8d0ba879777e 100644
--- a/drivers/md/dm-switch.c
+++ b/drivers/md/dm-switch.c
@@ -144,7 +144,7 @@ static unsigned switch_region_table_read(struct switch_ctx *sctx, unsigned long
 
 	switch_get_position(sctx, region_nr, &region_index, &bit);
 
-	return (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+	return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
 		((1 << sctx->region_table_entry_bits) - 1);
 }
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1e25705209c2..89e5dff9b4cf 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2431,7 +2431,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 	struct pool_c *pt = pool->ti->private;
 	bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
 	enum pool_mode old_mode = get_pool_mode(pool);
-	unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
+	unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
 
 	/*
 	 * Never allow the pool to transition to PM_WRITE mode if user
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index bda3caca23ca..fba93237a780 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -589,7 +589,7 @@ static void verity_prefetch_io(struct work_struct *work)
 		verity_hash_at_level(v, pw->block, i, &hash_block_start, NULL);
 		verity_hash_at_level(v, pw->block + pw->n_blocks - 1, i, &hash_block_end, NULL);
 		if (!i) {
-			unsigned cluster = ACCESS_ONCE(dm_verity_prefetch_cluster);
+			unsigned cluster = READ_ONCE(dm_verity_prefetch_cluster);
 
 			cluster >>= v->data_dev_block_bits;
 			if (unlikely(!cluster))
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4be85324f44d..8aaffa19b29a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -114,7 +114,7 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 
 static int __dm_get_module_param_int(int *module_param, int min, int max)
 {
-	int param = ACCESS_ONCE(*module_param);
+	int param = READ_ONCE(*module_param);
 	int modified_param = 0;
 	bool modified = true;
 
@@ -136,7 +136,7 @@ static int __dm_get_module_param_int(int *module_param, int min, int max)
 unsigned __dm_get_module_param(unsigned *module_param,
 			       unsigned def, unsigned max)
 {
-	unsigned param = ACCESS_ONCE(*module_param);
+	unsigned param = READ_ONCE(*module_param);
 	unsigned modified_param = 0;
 
 	if (!param)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0ff1bbf6c90e..447ddcbc9566 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2651,7 +2651,7 @@ state_show(struct md_rdev *rdev, char *page)
 {
 	char *sep = ",";
 	size_t len = 0;
-	unsigned long flags = ACCESS_ONCE(rdev->flags);
+	unsigned long flags = READ_ONCE(rdev->flags);
 
 	if (test_bit(Faulty, &flags) ||
 	    (!test_bit(ExternalBbl, &flags) &&
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 928e24a07133..7d9a50eed9db 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6072,7 +6072,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
 	 */
 	rcu_read_lock();
 	for (i = 0; i < conf->raid_disks; i++) {
-		struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev);
+		struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
 
 		if (rdev == NULL || test_bit(Faulty, &rdev->flags))
 			still_degraded = 1;
diff --git a/drivers/misc/mic/scif/scif_rb.c b/drivers/misc/mic/scif/scif_rb.c
index 637cc4686742..b665757ca89a 100644
--- a/drivers/misc/mic/scif/scif_rb.c
+++ b/drivers/misc/mic/scif/scif_rb.c
@@ -138,7 +138,7 @@ void scif_rb_commit(struct scif_rb *rb)
 	 * the read barrier in scif_rb_count(..)
 	 */
 	wmb();
-	ACCESS_ONCE(*rb->write_ptr) = rb->current_write_offset;
+	WRITE_ONCE(*rb->write_ptr, rb->current_write_offset);
 #ifdef CONFIG_INTEL_MIC_CARD
 	/*
 	 * X100 Si bug: For the case where a Core is performing an EXT_WR
@@ -147,7 +147,7 @@ void scif_rb_commit(struct scif_rb *rb)
 	 * This way, if ordering is violated for the Interrupt Message, it will
 	 * fall just behind the first Posted associated with the first EXT_WR.
 	 */
-	ACCESS_ONCE(*rb->write_ptr) = rb->current_write_offset;
+	WRITE_ONCE(*rb->write_ptr, rb->current_write_offset);
 #endif
 }
 
@@ -210,7 +210,7 @@ void scif_rb_update_read_ptr(struct scif_rb *rb)
 	 * scif_rb_space(..)
 	 */
 	mb();
-	ACCESS_ONCE(*rb->read_ptr) = new_offset;
+	WRITE_ONCE(*rb->read_ptr, new_offset);
 #ifdef CONFIG_INTEL_MIC_CARD
 	/*
 	 * X100 Si Bug: For the case where a Core is performing an EXT_WR
@@ -219,7 +219,7 @@ void scif_rb_update_read_ptr(struct scif_rb *rb)
 	 * This way, if ordering is violated for the Interrupt Message, it will
 	 * fall just behind the first Posted associated with the first EXT_WR.
 	 */
-	ACCESS_ONCE(*rb->read_ptr) = new_offset;
+	WRITE_ONCE(*rb->read_ptr, new_offset);
 #endif
 }
 
diff --git a/drivers/misc/mic/scif/scif_rma_list.c b/drivers/misc/mic/scif/scif_rma_list.c
index e1ef8daedd5a..a036dbb4101e 100644
--- a/drivers/misc/mic/scif/scif_rma_list.c
+++ b/drivers/misc/mic/scif/scif_rma_list.c
@@ -277,7 +277,7 @@ retry:
 		 * Need to restart list traversal if there has been
 		 * an asynchronous list entry deletion.
 		 */
-		if (ACCESS_ONCE(ep->rma_info.async_list_del))
+		if (READ_ONCE(ep->rma_info.async_list_del))
 			goto retry;
 	}
 	mutex_unlock(&ep->rma_info.rma_lock);
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index c02cc817a490..1ed9529e7bd1 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1378,7 +1378,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 				unsigned int count;
 
 				slaves = rcu_dereference(bond->slave_arr);
-				count = slaves ? ACCESS_ONCE(slaves->count) : 0;
+				count = slaves ? READ_ONCE(slaves->count) : 0;
 				if (likely(count))
 					tx_slave = slaves->arr[hash_index %
 							       count];
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index c99dc59d729b..af51b90cecbb 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1167,7 +1167,7 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
 	slave = bond_slave_get_rcu(skb->dev);
 	bond = slave->bond;
 
-	recv_probe = ACCESS_ONCE(bond->recv_probe);
+	recv_probe = READ_ONCE(bond->recv_probe);
 	if (recv_probe) {
 		ret = recv_probe(skb, bond, slave);
 		if (ret == RX_HANDLER_CONSUMED) {
@@ -3810,7 +3810,7 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
 		else
 			bond_xmit_slave_id(bond, skb, 0);
 	} else {
-		int slave_cnt = ACCESS_ONCE(bond->slave_cnt);
+		int slave_cnt = READ_ONCE(bond->slave_cnt);
 
 		if (likely(slave_cnt)) {
 			slave_id = bond_rr_gen_slave_id(bond);
@@ -3972,7 +3972,7 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
 	unsigned int count;
 
 	slaves = rcu_dereference(bond->slave_arr);
-	count = slaves ? ACCESS_ONCE(slaves->count) : 0;
+	count = slaves ? READ_ONCE(slaves->count) : 0;
 	if (likely(count)) {
 		slave = slaves->arr[bond_xmit_hash(bond, skb) % count];
 		bond_dev_queue_xmit(bond, skb, slave->dev);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index 4ef68f69b58c..43f52a8fe708 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -405,7 +405,7 @@ void free_tx_desc(struct adapter *adap, struct sge_txq *q,
  */
 static inline int reclaimable(const struct sge_txq *q)
 {
-	int hw_cidx = ntohs(ACCESS_ONCE(q->stat->cidx));
+	int hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
 	hw_cidx -= q->cidx;
 	return hw_cidx < 0 ? hw_cidx + q->size : hw_cidx;
 }
@@ -1375,7 +1375,7 @@ out_free:	dev_kfree_skb_any(skb);
  */
 static inline void reclaim_completed_tx_imm(struct sge_txq *q)
 {
-	int hw_cidx = ntohs(ACCESS_ONCE(q->stat->cidx));
+	int hw_cidx = ntohs(READ_ONCE(q->stat->cidx));
 	int reclaim = hw_cidx - q->cidx;
 
 	if (reclaim < 0)
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 0e3d9f39a807..c6e859a27ee6 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -605,7 +605,7 @@ static void accumulate_16bit_val(u32 *acc, u16 val)
 
 	if (wrapped)
 		newacc += 65536;
-	ACCESS_ONCE(*acc) = newacc;
+	WRITE_ONCE(*acc, newacc);
 }
 
 static void populate_erx_stats(struct be_adapter *adapter,
diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
index 0cec06bec63e..340e28211135 100644
--- a/drivers/net/ethernet/hisilicon/hip04_eth.c
+++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
@@ -373,7 +373,7 @@ static int hip04_tx_reclaim(struct net_device *ndev, bool force)
 	unsigned int count;
 
 	smp_rmb();
-	count = tx_count(ACCESS_ONCE(priv->tx_head), tx_tail);
+	count = tx_count(READ_ONCE(priv->tx_head), tx_tail);
 	if (count == 0)
 		goto out;
 
@@ -431,7 +431,7 @@ static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 	dma_addr_t phys;
 
 	smp_rmb();
-	count = tx_count(tx_head, ACCESS_ONCE(priv->tx_tail));
+	count = tx_count(tx_head, READ_ONCE(priv->tx_tail));
 	if (count == (TX_DESC_NUM - 1)) {
 		netif_stop_queue(ndev);
 		return NETDEV_TX_BUSY;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 8f326f87a815..2cb9539c931e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -264,7 +264,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid)
 		 vsi->rx_buf_failed, vsi->rx_page_failed);
 	rcu_read_lock();
 	for (i = 0; i < vsi->num_queue_pairs; i++) {
-		struct i40e_ring *rx_ring = ACCESS_ONCE(vsi->rx_rings[i]);
+		struct i40e_ring *rx_ring = READ_ONCE(vsi->rx_rings[i]);
 
 		if (!rx_ring)
 			continue;
@@ -320,7 +320,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid)
 			 ITR_IS_DYNAMIC(rx_ring->rx_itr_setting) ? "dynamic" : "fixed");
 	}
 	for (i = 0; i < vsi->num_queue_pairs; i++) {
-		struct i40e_ring *tx_ring = ACCESS_ONCE(vsi->tx_rings[i]);
+		struct i40e_ring *tx_ring = READ_ONCE(vsi->tx_rings[i]);
 
 		if (!tx_ring)
 			continue;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 05e89864f781..e9e04a485e0a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1570,7 +1570,7 @@ static void i40e_get_ethtool_stats(struct net_device *netdev,
 	}
 	rcu_read_lock();
 	for (j = 0; j < vsi->num_queue_pairs; j++) {
-		tx_ring = ACCESS_ONCE(vsi->tx_rings[j]);
+		tx_ring = READ_ONCE(vsi->tx_rings[j]);
 
 		if (!tx_ring)
 			continue;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 6498da8806cb..de1fcac7834d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -455,7 +455,7 @@ static void i40e_get_netdev_stats_struct(struct net_device *netdev,
 		u64 bytes, packets;
 		unsigned int start;
 
-		tx_ring = ACCESS_ONCE(vsi->tx_rings[i]);
+		tx_ring = READ_ONCE(vsi->tx_rings[i]);
 		if (!tx_ring)
 			continue;
 		i40e_get_netdev_stats_struct_tx(tx_ring, stats);
@@ -791,7 +791,7 @@ static void i40e_update_vsi_stats(struct i40e_vsi *vsi)
 	rcu_read_lock();
 	for (q = 0; q < vsi->num_queue_pairs; q++) {
 		/* locate Tx ring */
-		p = ACCESS_ONCE(vsi->tx_rings[q]);
+		p = READ_ONCE(vsi->tx_rings[q]);
 
 		do {
 			start = u64_stats_fetch_begin_irq(&p->syncp);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ptp.c b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
index d8456c381c99..97381238eb7c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ptp.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ptp.c
@@ -130,7 +130,7 @@ static int i40e_ptp_adjfreq(struct ptp_clock_info *ptp, s32 ppb)
 	}
 
 	smp_mb(); /* Force any pending update before accessing. */
-	adj = ACCESS_ONCE(pf->ptp_base_adj);
+	adj = READ_ONCE(pf->ptp_base_adj);
 
 	freq = adj;
 	freq *= ppb;
@@ -499,7 +499,7 @@ void i40e_ptp_set_increment(struct i40e_pf *pf)
 	wr32(hw, I40E_PRTTSYN_INC_H, incval >> 32);
 
 	/* Update the base adjustement value. */
-	ACCESS_ONCE(pf->ptp_base_adj) = incval;
+	WRITE_ONCE(pf->ptp_base_adj, incval);
 	smp_mb(); /* Force the above update. */
 }
 
diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h
index 58adbf234e07..31a3f09df9f7 100644
--- a/drivers/net/ethernet/intel/igb/e1000_regs.h
+++ b/drivers/net/ethernet/intel/igb/e1000_regs.h
@@ -375,7 +375,7 @@ u32 igb_rd32(struct e1000_hw *hw, u32 reg);
 /* write operations, indexed using DWORDS */
 #define wr32(reg, val) \
 do { \
-	u8 __iomem *hw_addr = ACCESS_ONCE((hw)->hw_addr); \
+	u8 __iomem *hw_addr = READ_ONCE((hw)->hw_addr); \
 	if (!E1000_REMOVED(hw_addr)) \
 		writel((val), &hw_addr[(reg)]); \
 } while (0)
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fd4a46b03cc8..6bccc2be2b91 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -750,7 +750,7 @@ static void igb_cache_ring_register(struct igb_adapter *adapter)
 u32 igb_rd32(struct e1000_hw *hw, u32 reg)
 {
 	struct igb_adapter *igb = container_of(hw, struct igb_adapter, hw);
-	u8 __iomem *hw_addr = ACCESS_ONCE(hw->hw_addr);
+	u8 __iomem *hw_addr = READ_ONCE(hw->hw_addr);
 	u32 value = 0;
 
 	if (E1000_REMOVED(hw_addr))
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
index e083732adf64..a01409e2e06c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.h
@@ -161,7 +161,7 @@ static inline bool ixgbe_removed(void __iomem *addr)
 
 static inline void ixgbe_write_reg(struct ixgbe_hw *hw, u32 reg, u32 value)
 {
-	u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
+	u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr);
 
 	if (ixgbe_removed(reg_addr))
 		return;
@@ -180,7 +180,7 @@ static inline void writeq(u64 val, void __iomem *addr)
 
 static inline void ixgbe_write_reg64(struct ixgbe_hw *hw, u32 reg, u64 value)
 {
-	u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
+	u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr);
 
 	if (ixgbe_removed(reg_addr))
 		return;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 4d76afd13868..2224e691ee07 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -380,7 +380,7 @@ static void ixgbe_check_remove(struct ixgbe_hw *hw, u32 reg)
  */
 u32 ixgbe_read_reg(struct ixgbe_hw *hw, u32 reg)
 {
-	u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
+	u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr);
 	u32 value;
 
 	if (ixgbe_removed(reg_addr))
@@ -8630,7 +8630,7 @@ static void ixgbe_get_stats64(struct net_device *netdev,
 
 	rcu_read_lock();
 	for (i = 0; i < adapter->num_rx_queues; i++) {
-		struct ixgbe_ring *ring = ACCESS_ONCE(adapter->rx_ring[i]);
+		struct ixgbe_ring *ring = READ_ONCE(adapter->rx_ring[i]);
 		u64 bytes, packets;
 		unsigned int start;
 
@@ -8646,12 +8646,12 @@ static void ixgbe_get_stats64(struct net_device *netdev,
 	}
 
 	for (i = 0; i < adapter->num_tx_queues; i++) {
-		struct ixgbe_ring *ring = ACCESS_ONCE(adapter->tx_ring[i]);
+		struct ixgbe_ring *ring = READ_ONCE(adapter->tx_ring[i]);
 
 		ixgbe_get_ring_stats64(stats, ring);
 	}
 	for (i = 0; i < adapter->num_xdp_queues; i++) {
-		struct ixgbe_ring *ring = ACCESS_ONCE(adapter->xdp_ring[i]);
+		struct ixgbe_ring *ring = READ_ONCE(adapter->xdp_ring[i]);
 
 		ixgbe_get_ring_stats64(stats, ring);
 	}
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
index 86d6924a2b71..ae312c45696a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ptp.c
@@ -378,7 +378,7 @@ static int ixgbe_ptp_adjfreq_82599(struct ptp_clock_info *ptp, s32 ppb)
 	}
 
 	smp_mb();
-	incval = ACCESS_ONCE(adapter->base_incval);
+	incval = READ_ONCE(adapter->base_incval);
 
 	freq = incval;
 	freq *= ppb;
@@ -1159,7 +1159,7 @@ void ixgbe_ptp_start_cyclecounter(struct ixgbe_adapter *adapter)
 	}
 
 	/* update the base incval used to calculate frequency adjustment */
-	ACCESS_ONCE(adapter->base_incval) = incval;
+	WRITE_ONCE(adapter->base_incval, incval);
 	smp_mb();
 
 	/* need lock to prevent incorrect read while modifying cyclecounter */
diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 032f8ac06357..cacb30682434 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -164,7 +164,7 @@ static void ixgbevf_check_remove(struct ixgbe_hw *hw, u32 reg)
 
 u32 ixgbevf_read_reg(struct ixgbe_hw *hw, u32 reg)
 {
-	u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
+	u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr);
 	u32 value;
 
 	if (IXGBE_REMOVED(reg_addr))
diff --git a/drivers/net/ethernet/intel/ixgbevf/vf.h b/drivers/net/ethernet/intel/ixgbevf/vf.h
index 04d8d4ee4f04..c651fefcc3d2 100644
--- a/drivers/net/ethernet/intel/ixgbevf/vf.h
+++ b/drivers/net/ethernet/intel/ixgbevf/vf.h
@@ -182,7 +182,7 @@ struct ixgbevf_info {
 
 static inline void ixgbe_write_reg(struct ixgbe_hw *hw, u32 reg, u32 value)
 {
-	u8 __iomem *reg_addr = ACCESS_ONCE(hw->hw_addr);
+	u8 __iomem *reg_addr = READ_ONCE(hw->hw_addr);
 
 	if (IXGBE_REMOVED(reg_addr))
 		return;
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 8a32a8f7f9c0..3541a7f9d12e 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -414,8 +414,8 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
 
 	index = cons_index & size_mask;
 	cqe = mlx4_en_get_cqe(buf, index, priv->cqe_size) + factor;
-	last_nr_txbb = ACCESS_ONCE(ring->last_nr_txbb);
-	ring_cons = ACCESS_ONCE(ring->cons);
+	last_nr_txbb = READ_ONCE(ring->last_nr_txbb);
+	ring_cons = READ_ONCE(ring->cons);
 	ring_index = ring_cons & size_mask;
 	stamp_index = ring_index;
 
@@ -479,8 +479,8 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
 	wmb();
 
 	/* we want to dirty this cache line once */
-	ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
-	ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
+	WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
+	WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped);
 
 	if (cq->type == TX_XDP)
 		return done < budget;
@@ -858,7 +858,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto tx_drop;
 
 	/* fetch ring->cons far ahead before needing it to avoid stall */
-	ring_cons = ACCESS_ONCE(ring->cons);
+	ring_cons = READ_ONCE(ring->cons);
 
 	real_size = get_real_size(skb, shinfo, dev, &lso_header_size,
 				  &inline_ok, &fragptr);
@@ -1066,7 +1066,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev)
 		 */
 		smp_rmb();
 
-		ring_cons = ACCESS_ONCE(ring->cons);
+		ring_cons = READ_ONCE(ring->cons);
 		if (unlikely(!mlx4_en_is_tx_ring_full(ring))) {
 			netif_tx_wake_queue(ring->tx_queue);
 			ring->wake_queue++;
diff --git a/drivers/net/ethernet/neterion/vxge/vxge-main.c b/drivers/net/ethernet/neterion/vxge/vxge-main.c
index 50ea69d88480..5dd5f61e1114 100644
--- a/drivers/net/ethernet/neterion/vxge/vxge-main.c
+++ b/drivers/net/ethernet/neterion/vxge/vxge-main.c
@@ -2629,7 +2629,7 @@ static void vxge_poll_vp_lockup(unsigned long data)
 		ring = &vdev->vpaths[i].ring;
 
 		/* Truncated to machine word size number of frames */
-		rx_frms = ACCESS_ONCE(ring->stats.rx_frms);
+		rx_frms = READ_ONCE(ring->stats.rx_frms);
 
 		/* Did this vpath received any packets */
 		if (ring->stats.prev_rx_frms == rx_frms) {
diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 13f72f5b18d2..a95a46bcd339 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -2073,7 +2073,7 @@ static irqreturn_t efx_ef10_msi_interrupt(int irq, void *dev_id)
 	netif_vdbg(efx, intr, efx->net_dev,
 		   "IRQ %d on CPU %d\n", irq, raw_smp_processor_id());
 
-	if (likely(ACCESS_ONCE(efx->irq_soft_enabled))) {
+	if (likely(READ_ONCE(efx->irq_soft_enabled))) {
 		/* Note test interrupts */
 		if (context->index == efx->irq_level)
 			efx->last_irq_cpu = raw_smp_processor_id();
@@ -2088,7 +2088,7 @@ static irqreturn_t efx_ef10_msi_interrupt(int irq, void *dev_id)
 static irqreturn_t efx_ef10_legacy_interrupt(int irq, void *dev_id)
 {
 	struct efx_nic *efx = dev_id;
-	bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled);
+	bool soft_enabled = READ_ONCE(efx->irq_soft_enabled);
 	struct efx_channel *channel;
 	efx_dword_t reg;
 	u32 queues;
@@ -3291,7 +3291,7 @@ static int efx_ef10_handle_rx_event(struct efx_channel *channel,
 	bool rx_cont;
 	u16 flags = 0;
 
-	if (unlikely(ACCESS_ONCE(efx->reset_pending)))
+	if (unlikely(READ_ONCE(efx->reset_pending)))
 		return 0;
 
 	/* Basic packet information */
@@ -3428,7 +3428,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event)
 	unsigned int tx_ev_q_label;
 	int tx_descs = 0;
 
-	if (unlikely(ACCESS_ONCE(efx->reset_pending)))
+	if (unlikely(READ_ONCE(efx->reset_pending)))
 		return 0;
 
 	if (unlikely(EFX_QWORD_FIELD(*event, ESF_DZ_TX_DROP_EVENT)))
@@ -5316,7 +5316,7 @@ static void efx_ef10_filter_remove_old(struct efx_nic *efx)
 	int i;
 
 	for (i = 0; i < HUNT_FILTER_TBL_ROWS; i++) {
-		if (ACCESS_ONCE(table->entry[i].spec) &
+		if (READ_ONCE(table->entry[i].spec) &
 		    EFX_EF10_FILTER_FLAG_AUTO_OLD) {
 			rc = efx_ef10_filter_remove_internal(efx,
 					1U << EFX_FILTER_PRI_AUTO, i, true);
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index b9cb697b2818..016616a63880 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -2809,7 +2809,7 @@ static void efx_reset_work(struct work_struct *data)
 	unsigned long pending;
 	enum reset_type method;
 
-	pending = ACCESS_ONCE(efx->reset_pending);
+	pending = READ_ONCE(efx->reset_pending);
 	method = fls(pending) - 1;
 
 	if (method == RESET_TYPE_MC_BIST)
@@ -2874,7 +2874,7 @@ void efx_schedule_reset(struct efx_nic *efx, enum reset_type type)
 	/* If we're not READY then just leave the flags set as the cue
 	 * to abort probing or reschedule the reset later.
 	 */
-	if (ACCESS_ONCE(efx->state) != STATE_READY)
+	if (READ_ONCE(efx->state) != STATE_READY)
 		return;
 
 	/* efx_process_channel() will no longer read events once a
diff --git a/drivers/net/ethernet/sfc/falcon/efx.c b/drivers/net/ethernet/sfc/falcon/efx.c
index 29614da91cbf..7263275fde4a 100644
--- a/drivers/net/ethernet/sfc/falcon/efx.c
+++ b/drivers/net/ethernet/sfc/falcon/efx.c
@@ -2545,7 +2545,7 @@ static void ef4_reset_work(struct work_struct *data)
 	unsigned long pending;
 	enum reset_type method;
 
-	pending = ACCESS_ONCE(efx->reset_pending);
+	pending = READ_ONCE(efx->reset_pending);
 	method = fls(pending) - 1;
 
 	if ((method == RESET_TYPE_RECOVER_OR_DISABLE ||
@@ -2605,7 +2605,7 @@ void ef4_schedule_reset(struct ef4_nic *efx, enum reset_type type)
 	/* If we're not READY then just leave the flags set as the cue
 	 * to abort probing or reschedule the reset later.
 	 */
-	if (ACCESS_ONCE(efx->state) != STATE_READY)
+	if (READ_ONCE(efx->state) != STATE_READY)
 		return;
 
 	queue_work(reset_workqueue, &efx->reset_work);
diff --git a/drivers/net/ethernet/sfc/falcon/falcon.c b/drivers/net/ethernet/sfc/falcon/falcon.c
index 93c713c1f627..cd8bb472d758 100644
--- a/drivers/net/ethernet/sfc/falcon/falcon.c
+++ b/drivers/net/ethernet/sfc/falcon/falcon.c
@@ -452,7 +452,7 @@ static irqreturn_t falcon_legacy_interrupt_a1(int irq, void *dev_id)
 		   "IRQ %d on CPU %d status " EF4_OWORD_FMT "\n",
 		   irq, raw_smp_processor_id(), EF4_OWORD_VAL(*int_ker));
 
-	if (!likely(ACCESS_ONCE(efx->irq_soft_enabled)))
+	if (!likely(READ_ONCE(efx->irq_soft_enabled)))
 		return IRQ_HANDLED;
 
 	/* Check to see if we have a serious error condition */
@@ -1372,7 +1372,7 @@ static void falcon_reconfigure_mac_wrapper(struct ef4_nic *efx)
 	ef4_oword_t reg;
 	int link_speed, isolate;
 
-	isolate = !!ACCESS_ONCE(efx->reset_pending);
+	isolate = !!READ_ONCE(efx->reset_pending);
 
 	switch (link_state->speed) {
 	case 10000: link_speed = 3; break;
diff --git a/drivers/net/ethernet/sfc/falcon/farch.c b/drivers/net/ethernet/sfc/falcon/farch.c
index 05916c710d8c..494884f6af4a 100644
--- a/drivers/net/ethernet/sfc/falcon/farch.c
+++ b/drivers/net/ethernet/sfc/falcon/farch.c
@@ -834,7 +834,7 @@ ef4_farch_handle_tx_event(struct ef4_channel *channel, ef4_qword_t *event)
 	struct ef4_nic *efx = channel->efx;
 	int tx_packets = 0;
 
-	if (unlikely(ACCESS_ONCE(efx->reset_pending)))
+	if (unlikely(READ_ONCE(efx->reset_pending)))
 		return 0;
 
 	if (likely(EF4_QWORD_FIELD(*event, FSF_AZ_TX_EV_COMP))) {
@@ -990,7 +990,7 @@ ef4_farch_handle_rx_event(struct ef4_channel *channel, const ef4_qword_t *event)
 	struct ef4_rx_queue *rx_queue;
 	struct ef4_nic *efx = channel->efx;
 
-	if (unlikely(ACCESS_ONCE(efx->reset_pending)))
+	if (unlikely(READ_ONCE(efx->reset_pending)))
 		return;
 
 	rx_ev_cont = EF4_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT);
@@ -1504,7 +1504,7 @@ irqreturn_t ef4_farch_fatal_interrupt(struct ef4_nic *efx)
 irqreturn_t ef4_farch_legacy_interrupt(int irq, void *dev_id)
 {
 	struct ef4_nic *efx = dev_id;
-	bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled);
+	bool soft_enabled = READ_ONCE(efx->irq_soft_enabled);
 	ef4_oword_t *int_ker = efx->irq_status.addr;
 	irqreturn_t result = IRQ_NONE;
 	struct ef4_channel *channel;
@@ -1596,7 +1596,7 @@ irqreturn_t ef4_farch_msi_interrupt(int irq, void *dev_id)
 		   "IRQ %d on CPU %d status " EF4_OWORD_FMT "\n",
 		   irq, raw_smp_processor_id(), EF4_OWORD_VAL(*int_ker));
 
-	if (!likely(ACCESS_ONCE(efx->irq_soft_enabled)))
+	if (!likely(READ_ONCE(efx->irq_soft_enabled)))
 		return IRQ_HANDLED;
 
 	/* Handle non-event-queue sources */
diff --git a/drivers/net/ethernet/sfc/falcon/nic.h b/drivers/net/ethernet/sfc/falcon/nic.h
index a4c4592f6023..54ca457cdb15 100644
--- a/drivers/net/ethernet/sfc/falcon/nic.h
+++ b/drivers/net/ethernet/sfc/falcon/nic.h
@@ -83,7 +83,7 @@ static inline struct ef4_tx_queue *ef4_tx_queue_partner(struct ef4_tx_queue *tx_
 static inline bool __ef4_nic_tx_is_empty(struct ef4_tx_queue *tx_queue,
 					 unsigned int write_count)
 {
-	unsigned int empty_read_count = ACCESS_ONCE(tx_queue->empty_read_count);
+	unsigned int empty_read_count = READ_ONCE(tx_queue->empty_read_count);
 
 	if (empty_read_count == 0)
 		return false;
@@ -464,11 +464,11 @@ irqreturn_t ef4_farch_fatal_interrupt(struct ef4_nic *efx);
 
 static inline int ef4_nic_event_test_irq_cpu(struct ef4_channel *channel)
 {
-	return ACCESS_ONCE(channel->event_test_cpu);
+	return READ_ONCE(channel->event_test_cpu);
 }
 static inline int ef4_nic_irq_test_irq_cpu(struct ef4_nic *efx)
 {
-	return ACCESS_ONCE(efx->last_irq_cpu);
+	return READ_ONCE(efx->last_irq_cpu);
 }
 
 /* Global Resources */
diff --git a/drivers/net/ethernet/sfc/falcon/tx.c b/drivers/net/ethernet/sfc/falcon/tx.c
index 6a75f4140a4b..6486814e97dc 100644
--- a/drivers/net/ethernet/sfc/falcon/tx.c
+++ b/drivers/net/ethernet/sfc/falcon/tx.c
@@ -134,8 +134,8 @@ static void ef4_tx_maybe_stop_queue(struct ef4_tx_queue *txq1)
 	 */
 	netif_tx_stop_queue(txq1->core_txq);
 	smp_mb();
-	txq1->old_read_count = ACCESS_ONCE(txq1->read_count);
-	txq2->old_read_count = ACCESS_ONCE(txq2->read_count);
+	txq1->old_read_count = READ_ONCE(txq1->read_count);
+	txq2->old_read_count = READ_ONCE(txq2->read_count);
 
 	fill_level = max(txq1->insert_count - txq1->old_read_count,
 			 txq2->insert_count - txq2->old_read_count);
@@ -524,7 +524,7 @@ void ef4_xmit_done(struct ef4_tx_queue *tx_queue, unsigned int index)
 
 	/* Check whether the hardware queue is now empty */
 	if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) {
-		tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count);
+		tx_queue->old_write_count = READ_ONCE(tx_queue->write_count);
 		if (tx_queue->read_count == tx_queue->old_write_count) {
 			smp_mb();
 			tx_queue->empty_read_count =
diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c
index ba45150f53c7..86454d25a405 100644
--- a/drivers/net/ethernet/sfc/farch.c
+++ b/drivers/net/ethernet/sfc/farch.c
@@ -827,7 +827,7 @@ efx_farch_handle_tx_event(struct efx_channel *channel, efx_qword_t *event)
 	struct efx_nic *efx = channel->efx;
 	int tx_packets = 0;
 
-	if (unlikely(ACCESS_ONCE(efx->reset_pending)))
+	if (unlikely(READ_ONCE(efx->reset_pending)))
 		return 0;
 
 	if (likely(EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_COMP))) {
@@ -979,7 +979,7 @@ efx_farch_handle_rx_event(struct efx_channel *channel, const efx_qword_t *event)
 	struct efx_rx_queue *rx_queue;
 	struct efx_nic *efx = channel->efx;
 
-	if (unlikely(ACCESS_ONCE(efx->reset_pending)))
+	if (unlikely(READ_ONCE(efx->reset_pending)))
 		return;
 
 	rx_ev_cont = EFX_QWORD_FIELD(*event, FSF_AZ_RX_EV_JUMBO_CONT);
@@ -1520,7 +1520,7 @@ irqreturn_t efx_farch_fatal_interrupt(struct efx_nic *efx)
 irqreturn_t efx_farch_legacy_interrupt(int irq, void *dev_id)
 {
 	struct efx_nic *efx = dev_id;
-	bool soft_enabled = ACCESS_ONCE(efx->irq_soft_enabled);
+	bool soft_enabled = READ_ONCE(efx->irq_soft_enabled);
 	efx_oword_t *int_ker = efx->irq_status.addr;
 	irqreturn_t result = IRQ_NONE;
 	struct efx_channel *channel;
@@ -1612,7 +1612,7 @@ irqreturn_t efx_farch_msi_interrupt(int irq, void *dev_id)
 		   "IRQ %d on CPU %d status " EFX_OWORD_FMT "\n",
 		   irq, raw_smp_processor_id(), EFX_OWORD_VAL(*int_ker));
 
-	if (!likely(ACCESS_ONCE(efx->irq_soft_enabled)))
+	if (!likely(READ_ONCE(efx->irq_soft_enabled)))
 		return IRQ_HANDLED;
 
 	/* Handle non-event-queue sources */
diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index 4d7fb8af880d..7b51b6371724 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -81,7 +81,7 @@ static struct efx_tx_queue *efx_tx_queue_partner(struct efx_tx_queue *tx_queue)
 static inline bool __efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue,
 					 unsigned int write_count)
 {
-	unsigned int empty_read_count = ACCESS_ONCE(tx_queue->empty_read_count);
+	unsigned int empty_read_count = READ_ONCE(tx_queue->empty_read_count);
 
 	if (empty_read_count == 0)
 		return false;
@@ -617,11 +617,11 @@ irqreturn_t efx_farch_fatal_interrupt(struct efx_nic *efx);
 
 static inline int efx_nic_event_test_irq_cpu(struct efx_channel *channel)
 {
-	return ACCESS_ONCE(channel->event_test_cpu);
+	return READ_ONCE(channel->event_test_cpu);
 }
 static inline int efx_nic_irq_test_irq_cpu(struct efx_nic *efx)
 {
-	return ACCESS_ONCE(efx->last_irq_cpu);
+	return READ_ONCE(efx->last_irq_cpu);
 }
 
 /* Global Resources */
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index 60cdb97f58e2..56c2db398def 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -658,7 +658,7 @@ static void efx_ptp_send_times(struct efx_nic *efx,
 
 	/* Write host time for specified period or until MC is done */
 	while ((timespec64_compare(&now.ts_real, &limit) < 0) &&
-	       ACCESS_ONCE(*mc_running)) {
+	       READ_ONCE(*mc_running)) {
 		struct timespec64 update_time;
 		unsigned int host_time;
 
@@ -668,7 +668,7 @@ static void efx_ptp_send_times(struct efx_nic *efx,
 		do {
 			pps_get_ts(&now);
 		} while ((timespec64_compare(&now.ts_real, &update_time) < 0) &&
-			 ACCESS_ONCE(*mc_running));
+			 READ_ONCE(*mc_running));
 
 		/* Synchronise NIC with single word of time only */
 		host_time = (now.ts_real.tv_sec << MC_NANOSECOND_BITS |
@@ -832,14 +832,14 @@ static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings)
 		       ptp->start.dma_addr);
 
 	/* Clear flag that signals MC ready */
-	ACCESS_ONCE(*start) = 0;
+	WRITE_ONCE(*start, 0);
 	rc = efx_mcdi_rpc_start(efx, MC_CMD_PTP, synch_buf,
 				MC_CMD_PTP_IN_SYNCHRONIZE_LEN);
 	EFX_WARN_ON_ONCE_PARANOID(rc);
 
 	/* Wait for start from MCDI (or timeout) */
 	timeout = jiffies + msecs_to_jiffies(MAX_SYNCHRONISE_WAIT_MS);
-	while (!ACCESS_ONCE(*start) && (time_before(jiffies, timeout))) {
+	while (!READ_ONCE(*start) && (time_before(jiffies, timeout))) {
 		udelay(20);	/* Usually start MCDI execution quickly */
 		loops++;
 	}
@@ -849,7 +849,7 @@ static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings)
 	if (!time_before(jiffies, timeout))
 		++ptp->sync_timeouts;
 
-	if (ACCESS_ONCE(*start))
+	if (READ_ONCE(*start))
 		efx_ptp_send_times(efx, &last_time);
 
 	/* Collect results */
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index 32bf1fecf864..efb66ea21f27 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -136,8 +136,8 @@ static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1)
 	 */
 	netif_tx_stop_queue(txq1->core_txq);
 	smp_mb();
-	txq1->old_read_count = ACCESS_ONCE(txq1->read_count);
-	txq2->old_read_count = ACCESS_ONCE(txq2->read_count);
+	txq1->old_read_count = READ_ONCE(txq1->read_count);
+	txq2->old_read_count = READ_ONCE(txq2->read_count);
 
 	fill_level = max(txq1->insert_count - txq1->old_read_count,
 			 txq2->insert_count - txq2->old_read_count);
@@ -752,7 +752,7 @@ void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index)
 
 	/* Check whether the hardware queue is now empty */
 	if ((int)(tx_queue->read_count - tx_queue->old_write_count) >= 0) {
-		tx_queue->old_write_count = ACCESS_ONCE(tx_queue->write_count);
+		tx_queue->old_write_count = READ_ONCE(tx_queue->write_count);
 		if (tx_queue->read_count == tx_queue->old_write_count) {
 			smp_mb();
 			tx_queue->empty_read_count =
diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c
index 6a4e8e1bbd90..8ab0fb6892d5 100644
--- a/drivers/net/ethernet/sun/niu.c
+++ b/drivers/net/ethernet/sun/niu.c
@@ -6245,7 +6245,7 @@ static void niu_get_rx_stats(struct niu *np,
 
 	pkts = dropped = errors = bytes = 0;
 
-	rx_rings = ACCESS_ONCE(np->rx_rings);
+	rx_rings = READ_ONCE(np->rx_rings);
 	if (!rx_rings)
 		goto no_rings;
 
@@ -6276,7 +6276,7 @@ static void niu_get_tx_stats(struct niu *np,
 
 	pkts = errors = bytes = 0;
 
-	tx_rings = ACCESS_ONCE(np->tx_rings);
+	tx_rings = READ_ONCE(np->tx_rings);
 	if (!tx_rings)
 		goto no_rings;
 
diff --git a/drivers/net/tap.c b/drivers/net/tap.c
index 21b71ae947fd..b55b29b90b88 100644
--- a/drivers/net/tap.c
+++ b/drivers/net/tap.c
@@ -257,7 +257,7 @@ static struct tap_queue *tap_get_queue(struct tap_dev *tap,
 	 * and validate that the result isn't NULL - in case we are
 	 * racing against queue removal.
 	 */
-	int numvtaps = ACCESS_ONCE(tap->numvtaps);
+	int numvtaps = READ_ONCE(tap->numvtaps);
 	__u32 rxq;
 
 	if (!numvtaps)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e21bf90b819f..27cd50c5bc9e 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -469,7 +469,7 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
 	u32 numqueues = 0;
 
 	rcu_read_lock();
-	numqueues = ACCESS_ONCE(tun->numqueues);
+	numqueues = READ_ONCE(tun->numqueues);
 
 	txq = __skb_get_hash_symmetric(skb);
 	if (txq) {
@@ -864,7 +864,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	rcu_read_lock();
 	tfile = rcu_dereference(tun->tfiles[txq]);
-	numqueues = ACCESS_ONCE(tun->numqueues);
+	numqueues = READ_ONCE(tun->numqueues);
 
 	/* Drop packet if interface is not attached */
 	if (txq >= numqueues)
diff --git a/drivers/net/wireless/ath/ath5k/desc.c b/drivers/net/wireless/ath/ath5k/desc.c
index bd8d4392d68b..80f75139495f 100644
--- a/drivers/net/wireless/ath/ath5k/desc.c
+++ b/drivers/net/wireless/ath/ath5k/desc.c
@@ -500,13 +500,13 @@ ath5k_hw_proc_4word_tx_status(struct ath5k_hw *ah,
 
 	tx_status = &desc->ud.ds_tx5212.tx_stat;
 
-	txstat1 = ACCESS_ONCE(tx_status->tx_status_1);
+	txstat1 = READ_ONCE(tx_status->tx_status_1);
 
 	/* No frame has been send or error */
 	if (unlikely(!(txstat1 & AR5K_DESC_TX_STATUS1_DONE)))
 		return -EINPROGRESS;
 
-	txstat0 = ACCESS_ONCE(tx_status->tx_status_0);
+	txstat0 = READ_ONCE(tx_status->tx_status_0);
 
 	/*
 	 * Get descriptor status
@@ -700,14 +700,14 @@ ath5k_hw_proc_5212_rx_status(struct ath5k_hw *ah,
 	u32 rxstat0, rxstat1;
 
 	rx_status = &desc->ud.ds_rx.rx_stat;
-	rxstat1 = ACCESS_ONCE(rx_status->rx_status_1);
+	rxstat1 = READ_ONCE(rx_status->rx_status_1);
 
 	/* No frame received / not ready */
 	if (unlikely(!(rxstat1 & AR5K_5212_RX_DESC_STATUS1_DONE)))
 		return -EINPROGRESS;
 
 	memset(rs, 0, sizeof(struct ath5k_rx_status));
-	rxstat0 = ACCESS_ONCE(rx_status->rx_status_0);
+	rxstat0 = READ_ONCE(rx_status->rx_status_0);
 
 	/*
 	 * Frame receive status
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index 613caca7dc02..785a0f33b7e6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -3628,7 +3628,7 @@ static void brcmf_sdio_dataworker(struct work_struct *work)
 
 	bus->dpc_running = true;
 	wmb();
-	while (ACCESS_ONCE(bus->dpc_triggered)) {
+	while (READ_ONCE(bus->dpc_triggered)) {
 		bus->dpc_triggered = false;
 		brcmf_sdio_dpc(bus);
 		bus->idlecount = 0;
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
index 231878969332..0f45f34e39d3 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c
@@ -1118,7 +1118,7 @@ void iwl_mvm_set_hw_ctkill_state(struct iwl_mvm *mvm, bool state)
 static bool iwl_mvm_set_hw_rfkill_state(struct iwl_op_mode *op_mode, bool state)
 {
 	struct iwl_mvm *mvm = IWL_OP_MODE_GET_MVM(op_mode);
-	bool calibrating = ACCESS_ONCE(mvm->calibrating);
+	bool calibrating = READ_ONCE(mvm->calibrating);
 
 	if (state)
 		set_bit(IWL_MVM_STATUS_HW_RFKILL, &mvm->status);
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
index 6f2e2af23219..6e9d3289b9d0 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/tx.c
@@ -652,7 +652,7 @@ int iwl_mvm_tx_skb_non_sta(struct iwl_mvm *mvm, struct sk_buff *skb)
 				return -1;
 		} else if (info.control.vif->type == NL80211_IFTYPE_STATION &&
 			   is_multicast_ether_addr(hdr->addr1)) {
-			u8 ap_sta_id = ACCESS_ONCE(mvmvif->ap_sta_id);
+			u8 ap_sta_id = READ_ONCE(mvmvif->ap_sta_id);
 
 			if (ap_sta_id != IWL_MVM_INVALID_STA)
 				sta_id = ap_sta_id;
@@ -700,7 +700,7 @@ static int iwl_mvm_tx_tso(struct iwl_mvm *mvm, struct sk_buff *skb,
 	snap_ip_tcp = 8 + skb_transport_header(skb) - skb_network_header(skb) +
 		tcp_hdrlen(skb);
 
-	dbg_max_amsdu_len = ACCESS_ONCE(mvm->max_amsdu_len);
+	dbg_max_amsdu_len = READ_ONCE(mvm->max_amsdu_len);
 
 	if (!sta->max_amsdu_len ||
 	    !ieee80211_is_data_qos(hdr->frame_control) ||
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
index a06b6612b658..f25ce3a1ea50 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
@@ -1247,7 +1247,7 @@ restart:
 	spin_lock(&rxq->lock);
 	/* uCode's read index (stored in shared DRAM) indicates the last Rx
 	 * buffer that the driver may process (last buffer filled by ucode). */
-	r = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF;
+	r = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF;
 	i = rxq->read;
 
 	/* W/A 9000 device step A0 wrap-around bug */
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
index 2e3e013ec95a..9ad3f4fe5894 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
@@ -2076,12 +2076,12 @@ static int iwl_trans_pcie_wait_txq_empty(struct iwl_trans *trans, int txq_idx)
 
 	IWL_DEBUG_TX_QUEUES(trans, "Emptying queue %d...\n", txq_idx);
 	txq = trans_pcie->txq[txq_idx];
-	wr_ptr = ACCESS_ONCE(txq->write_ptr);
+	wr_ptr = READ_ONCE(txq->write_ptr);
 
-	while (txq->read_ptr != ACCESS_ONCE(txq->write_ptr) &&
+	while (txq->read_ptr != READ_ONCE(txq->write_ptr) &&
 	       !time_after(jiffies,
 			   now + msecs_to_jiffies(IWL_FLUSH_WAIT_MS))) {
-		u8 write_ptr = ACCESS_ONCE(txq->write_ptr);
+		u8 write_ptr = READ_ONCE(txq->write_ptr);
 
 		if (WARN_ONCE(wr_ptr != write_ptr,
 			      "WR pointer moved while flushing %d -> %d\n",
@@ -2553,7 +2553,7 @@ static u32 iwl_trans_pcie_dump_rbs(struct iwl_trans *trans,
 
 	spin_lock(&rxq->lock);
 
-	r = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF;
+	r = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num)) & 0x0FFF;
 
 	for (i = rxq->read, j = 0;
 	     i != r && j < allocated_rb_nums;
@@ -2814,7 +2814,7 @@ static struct iwl_trans_dump_data
 		/* Dump RBs is supported only for pre-9000 devices (1 queue) */
 		struct iwl_rxq *rxq = &trans_pcie->rxq[0];
 		/* RBs */
-		num_rbs = le16_to_cpu(ACCESS_ONCE(rxq->rb_stts->closed_rb_num))
+		num_rbs = le16_to_cpu(READ_ONCE(rxq->rb_stts->closed_rb_num))
 				      & 0x0FFF;
 		num_rbs = (num_rbs - rxq->read) & RX_QUEUE_MASK;
 		len += num_rbs * (sizeof(*data) +
diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c
index 6467ffac9811..d2b3d6177a55 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -1380,7 +1380,7 @@ static void mac80211_hwsim_tx(struct ieee80211_hw *hw,
 	mac80211_hwsim_monitor_rx(hw, skb, channel);
 
 	/* wmediumd mode check */
-	_portid = ACCESS_ONCE(data->wmediumd);
+	_portid = READ_ONCE(data->wmediumd);
 
 	if (_portid)
 		return mac80211_hwsim_tx_frame_nl(hw, skb, _portid);
@@ -1477,7 +1477,7 @@ static void mac80211_hwsim_tx_frame(struct ieee80211_hw *hw,
 				    struct ieee80211_channel *chan)
 {
 	struct mac80211_hwsim_data *data = hw->priv;
-	u32 _pid = ACCESS_ONCE(data->wmediumd);
+	u32 _pid = READ_ONCE(data->wmediumd);
 
 	if (ieee80211_hw_check(hw, SUPPORTS_RC_TABLE)) {
 		struct ieee80211_tx_info *txi = IEEE80211_SKB_CB(skb);
diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c
index f05cfc83c9c8..f946bf889015 100644
--- a/drivers/scsi/qla2xxx/qla_target.c
+++ b/drivers/scsi/qla2xxx/qla_target.c
@@ -996,7 +996,7 @@ static void qlt_free_session_done(struct work_struct *work)
 	if (logout_started) {
 		bool traced = false;
 
-		while (!ACCESS_ONCE(sess->logout_completed)) {
+		while (!READ_ONCE(sess->logout_completed)) {
 			if (!traced) {
 				ql_dbg(ql_dbg_tgt_mgt, vha, 0xf086,
 					"%s: waiting for sess %p logout\n",
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 942d094269fb..9469695f5871 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -985,7 +985,7 @@ static unsigned int tcmu_handle_completions(struct tcmu_dev *udev)
 	mb = udev->mb_addr;
 	tcmu_flush_dcache_range(mb, sizeof(*mb));
 
-	while (udev->cmdr_last_cleaned != ACCESS_ONCE(mb->cmd_tail)) {
+	while (udev->cmdr_last_cleaned != READ_ONCE(mb->cmd_tail)) {
 
 		struct tcmu_cmd_entry *entry = (void *) mb + CMDR_OFF + udev->cmdr_last_cleaned;
 		struct tcmu_cmd *cmd;
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 3e865dbf878c..fbaa2a90d25d 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -483,7 +483,7 @@ static ssize_t wdm_read
 	if (rv < 0)
 		return -ERESTARTSYS;
 
-	cntr = ACCESS_ONCE(desc->length);
+	cntr = READ_ONCE(desc->length);
 	if (cntr == 0) {
 		desc->read = 0;
 retry:
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index e9326f31db8d..4ae667d8c238 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -150,7 +150,7 @@ static int usbfs_increase_memory_usage(u64 amount)
 {
 	u64 lim;
 
-	lim = ACCESS_ONCE(usbfs_memory_mb);
+	lim = READ_ONCE(usbfs_memory_mb);
 	lim <<= 20;
 
 	atomic64_add(amount, &usbfs_memory_usage);
diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c
index d930bfda4010..58d59c5f8592 100644
--- a/drivers/usb/core/sysfs.c
+++ b/drivers/usb/core/sysfs.c
@@ -973,7 +973,7 @@ static ssize_t interface_show(struct device *dev, struct device_attribute *attr,
 	char *string;
 
 	intf = to_usb_interface(dev);
-	string = ACCESS_ONCE(intf->cur_altsetting->string);
+	string = READ_ONCE(intf->cur_altsetting->string);
 	if (!string)
 		return 0;
 	return sprintf(buf, "%s\n", string);
@@ -989,7 +989,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
 
 	intf = to_usb_interface(dev);
 	udev = interface_to_usbdev(intf);
-	alt = ACCESS_ONCE(intf->cur_altsetting);
+	alt = READ_ONCE(intf->cur_altsetting);
 
 	return sprintf(buf, "usb:v%04Xp%04Xd%04Xdc%02Xdsc%02Xdp%02X"
 			"ic%02Xisc%02Xip%02Xin%02X\n",
diff --git a/drivers/usb/gadget/udc/gr_udc.c b/drivers/usb/gadget/udc/gr_udc.c
index 1f9941145746..0b59fa50aa30 100644
--- a/drivers/usb/gadget/udc/gr_udc.c
+++ b/drivers/usb/gadget/udc/gr_udc.c
@@ -1261,7 +1261,7 @@ static int gr_handle_in_ep(struct gr_ep *ep)
 	if (!req->last_desc)
 		return 0;
 
-	if (ACCESS_ONCE(req->last_desc->ctrl) & GR_DESC_IN_CTRL_EN)
+	if (READ_ONCE(req->last_desc->ctrl) & GR_DESC_IN_CTRL_EN)
 		return 0; /* Not put in hardware buffers yet */
 
 	if (gr_read32(&ep->regs->epstat) & (GR_EPSTAT_B1 | GR_EPSTAT_B0))
@@ -1290,7 +1290,7 @@ static int gr_handle_out_ep(struct gr_ep *ep)
 	if (!req->curr_desc)
 		return 0;
 
-	ctrl = ACCESS_ONCE(req->curr_desc->ctrl);
+	ctrl = READ_ONCE(req->curr_desc->ctrl);
 	if (ctrl & GR_DESC_OUT_CTRL_EN)
 		return 0; /* Not received yet */
 
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index 44924824fa41..c86f89babd57 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -785,7 +785,7 @@ static void io_watchdog_func(unsigned long _ohci)
 		}
 
 		/* find the last TD processed by the controller. */
-		head = hc32_to_cpu(ohci, ACCESS_ONCE(ed->hwHeadP)) & TD_MASK;
+		head = hc32_to_cpu(ohci, READ_ONCE(ed->hwHeadP)) & TD_MASK;
 		td_start = td;
 		td_next = list_prepare_entry(td, &ed->td_list, td_list);
 		list_for_each_entry_continue(td_next, &ed->td_list, td_list) {
diff --git a/drivers/usb/host/uhci-hcd.h b/drivers/usb/host/uhci-hcd.h
index 91b22b2ea3aa..09a2a259941b 100644
--- a/drivers/usb/host/uhci-hcd.h
+++ b/drivers/usb/host/uhci-hcd.h
@@ -186,7 +186,7 @@ struct uhci_qh {
  * We need a special accessor for the element pointer because it is
  * subject to asynchronous updates by the controller.
  */
-#define qh_element(qh)		ACCESS_ONCE((qh)->element)
+#define qh_element(qh)		READ_ONCE((qh)->element)
 
 #define LINK_TO_QH(uhci, qh)	(UHCI_PTR_QH((uhci)) | \
 				cpu_to_hc32((uhci), (qh)->dma_handle))
@@ -274,7 +274,7 @@ struct uhci_td {
  * subject to asynchronous updates by the controller.
  */
 #define td_status(uhci, td)		hc32_to_cpu((uhci), \
-						ACCESS_ONCE((td)->status))
+						READ_ONCE((td)->status))
 
 #define LINK_TO_TD(uhci, td)		(cpu_to_hc32((uhci), (td)->dma_handle))
 
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index f5a86f651f38..2bc3705a99bd 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -665,7 +665,7 @@ static int vfio_dev_viable(struct device *dev, void *data)
 {
 	struct vfio_group *group = data;
 	struct vfio_device *device;
-	struct device_driver *drv = ACCESS_ONCE(dev->driver);
+	struct device_driver *drv = READ_ONCE(dev->driver);
 	struct vfio_unbound_dev *unbound;
 	int ret = -EINVAL;
 
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 046f6d280af5..35e929f132e8 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -929,7 +929,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq)
 			continue;
 		}
 
-		tpg = ACCESS_ONCE(vs_tpg[*target]);
+		tpg = READ_ONCE(vs_tpg[*target]);
 		if (unlikely(!tpg)) {
 			/* Target does not exist, fail the request */
 			vhost_scsi_send_bad_target(vs, vq, head, out);
diff --git a/fs/aio.c b/fs/aio.c
index 5a2487217072..e6de7715228c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -576,7 +576,7 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
 	 * actually has a cancel function, hence the cmpxchg()
 	 */
 
-	cancel = ACCESS_ONCE(kiocb->ki_cancel);
+	cancel = READ_ONCE(kiocb->ki_cancel);
 	do {
 		if (!cancel || cancel == KIOCB_CANCELLED)
 			return -EINVAL;
diff --git a/fs/buffer.c b/fs/buffer.c
index 170df856bdb9..32ce01f0f95f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1692,7 +1692,8 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode *
 	BUG_ON(!PageLocked(page));
 
 	if (!page_has_buffers(page))
-		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+		create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
+				     b_state);
 	return page_buffers(page);
 }
 
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 8e704d12a1cf..0083bd4fcaa5 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -373,7 +373,7 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
 	struct fscrypt_info *prev;
 
 	if (ci == NULL)
-		ci = ACCESS_ONCE(inode->i_crypt_info);
+		ci = READ_ONCE(inode->i_crypt_info);
 	if (ci == NULL)
 		return;
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b53e66d9abd7..98fe1325da9d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1152,7 +1152,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		      get_block_t get_block, dio_iodone_t end_io,
 		      dio_submit_t submit_io, int flags)
 {
-	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
+	unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
 	unsigned blkbits = i_blkbits;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
diff --git a/fs/exec.c b/fs/exec.c
index 3e14ba25f678..1d6243d9f2b6 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1911,7 +1911,7 @@ void set_dumpable(struct mm_struct *mm, int value)
 		return;
 
 	do {
-		old = ACCESS_ONCE(mm->flags);
+		old = READ_ONCE(mm->flags);
 		new = (old & ~MMF_DUMPABLE_MASK) | value;
 	} while (cmpxchg(&mm->flags, old, new) != old);
 }
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 448a1119f0be..57bf2964bb83 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -724,7 +724,7 @@ static void send_sigio_to_task(struct task_struct *p,
 	 * F_SETSIG can change ->signum lockless in parallel, make
 	 * sure we read it once and use the same value throughout.
 	 */
-	int signum = ACCESS_ONCE(fown->signum);
+	int signum = READ_ONCE(fown->signum);
 
 	if (!sigio_perm(p, fown, signum))
 		return;
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index e747b3d720ee..2d07f292b625 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -78,7 +78,7 @@ void mnt_pin_kill(struct mount *m)
 	while (1) {
 		struct hlist_node *p;
 		rcu_read_lock();
-		p = ACCESS_ONCE(m->mnt_pins.first);
+		p = READ_ONCE(m->mnt_pins.first);
 		if (!p) {
 			rcu_read_unlock();
 			break;
@@ -92,7 +92,7 @@ void group_pin_kill(struct hlist_head *p)
 	while (1) {
 		struct hlist_node *q;
 		rcu_read_lock();
-		q = ACCESS_ONCE(p->first);
+		q = READ_ONCE(p->first);
 		if (!q) {
 			rcu_read_unlock();
 			break;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 13c65dd2d37d..a42d89371748 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -33,7 +33,7 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
 	 * Lockless access is OK, because file->private data is set
 	 * once during mount and is valid until the file is released.
 	 */
-	return ACCESS_ONCE(file->private_data);
+	return READ_ONCE(file->private_data);
 }
 
 static void fuse_request_init(struct fuse_req *req, struct page **pages,
diff --git a/fs/inode.c b/fs/inode.c
index d1e35b53bb23..fd401028a309 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2090,7 +2090,7 @@ void inode_set_flags(struct inode *inode, unsigned int flags,
 
 	WARN_ON_ONCE(flags & ~mask);
 	do {
-		old_flags = ACCESS_ONCE(inode->i_flags);
+		old_flags = READ_ONCE(inode->i_flags);
 		new_flags = (old_flags & ~mask) | flags;
 	} while (unlikely(cmpxchg(&inode->i_flags, old_flags,
 				  new_flags) != old_flags));
diff --git a/fs/namei.c b/fs/namei.c
index c75ea03ca147..40a0f34bf990 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1209,7 +1209,7 @@ static int follow_managed(struct path *path, struct nameidata *nd)
 	/* Given that we're not holding a lock here, we retain the value in a
 	 * local variable for each dentry as we look at it so that we don't see
 	 * the components of that value change under us */
-	while (managed = ACCESS_ONCE(path->dentry->d_flags),
+	while (managed = READ_ONCE(path->dentry->d_flags),
 	       managed &= DCACHE_MANAGED_DENTRY,
 	       unlikely(managed != 0)) {
 		/* Allow the filesystem to manage the transit without i_mutex
@@ -1394,7 +1394,7 @@ int follow_down(struct path *path)
 	unsigned managed;
 	int ret;
 
-	while (managed = ACCESS_ONCE(path->dentry->d_flags),
+	while (managed = READ_ONCE(path->dentry->d_flags),
 	       unlikely(managed & DCACHE_MANAGED_DENTRY)) {
 		/* Allow the filesystem to manage the transit without i_mutex
 		 * being held.
diff --git a/fs/namespace.c b/fs/namespace.c
index d18deb4c410b..e158ec6b527b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -353,7 +353,7 @@ int __mnt_want_write(struct vfsmount *m)
 	 * incremented count after it has set MNT_WRITE_HOLD.
 	 */
 	smp_mb();
-	while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
+	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD)
 		cpu_relax();
 	/*
 	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5ceaeb1f6fb6..f439f1c45008 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1081,7 +1081,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 	int error;
 
 	if (flags & LOOKUP_RCU) {
-		parent = ACCESS_ONCE(dentry->d_parent);
+		parent = READ_ONCE(dentry->d_parent);
 		dir = d_inode_rcu(parent);
 		if (!dir)
 			return -ECHILD;
@@ -1168,7 +1168,7 @@ out_set_verifier:
 	nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
  out_valid:
 	if (flags & LOOKUP_RCU) {
-		if (parent != ACCESS_ONCE(dentry->d_parent))
+		if (parent != READ_ONCE(dentry->d_parent))
 			return -ECHILD;
 	} else
 		dput(parent);
@@ -1582,7 +1582,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 		struct inode *dir;
 
 		if (flags & LOOKUP_RCU) {
-			parent = ACCESS_ONCE(dentry->d_parent);
+			parent = READ_ONCE(dentry->d_parent);
 			dir = d_inode_rcu(parent);
 			if (!dir)
 				return -ECHILD;
@@ -1596,7 +1596,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
 			ret = -ECHILD;
 		if (!(flags & LOOKUP_RCU))
 			dput(parent);
-		else if (parent != ACCESS_ONCE(dentry->d_parent))
+		else if (parent != READ_ONCE(dentry->d_parent))
 			return -ECHILD;
 		goto out;
 	}
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 77a8eacbe032..375e8bf0dd24 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -453,7 +453,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		cutime = sig->cutime;
 		cstime = sig->cstime;
 		cgtime = sig->cgtime;
-		rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
+		rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
 
 		/* add up live thread stats at the group level */
 		if (whole) {
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 99dff222fe67..03afd5150916 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -27,7 +27,7 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
 
 	poll_wait(file, &p->ns->poll, wait);
 
-	event = ACCESS_ONCE(ns->event);
+	event = READ_ONCE(ns->event);
 	if (m->poll_event != event) {
 		m->poll_event = event;
 		res |= POLLERR | POLLPRI;
diff --git a/fs/splice.c b/fs/splice.c
index f3084cce0ea6..39e2dc01ac12 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -253,7 +253,7 @@ EXPORT_SYMBOL(add_to_pipe);
  */
 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
 {
-	unsigned int buffers = ACCESS_ONCE(pipe->buffers);
+	unsigned int buffers = READ_ONCE(pipe->buffers);
 
 	spd->nr_pages_max = buffers;
 	if (buffers <= PIPE_DEF_BUFFERS)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 1c713fd5b3e6..f46d133c0949 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -381,7 +381,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * in __get_user_pages if userfaultfd_release waits on the
 	 * caller of handle_userfault to release the mmap_sem.
 	 */
-	if (unlikely(ACCESS_ONCE(ctx->released))) {
+	if (unlikely(READ_ONCE(ctx->released))) {
 		/*
 		 * Don't return VM_FAULT_SIGBUS in this case, so a non
 		 * cooperative manager can close the uffd after the
@@ -477,7 +477,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 						       vmf->flags, reason);
 	up_read(&mm->mmap_sem);
 
-	if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
+	if (likely(must_wait && !READ_ONCE(ctx->released) &&
 		   (return_to_userland ? !signal_pending(current) :
 		    !fatal_signal_pending(current)))) {
 		wake_up_poll(&ctx->fd_wqh, POLLIN);
@@ -586,7 +586,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
 		set_current_state(TASK_KILLABLE);
 		if (ewq->msg.event == 0)
 			break;
-		if (ACCESS_ONCE(ctx->released) ||
+		if (READ_ONCE(ctx->released) ||
 		    fatal_signal_pending(current)) {
 			/*
 			 * &ewq->wq may be queued in fork_event, but
@@ -833,7 +833,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
 	struct userfaultfd_wake_range range = { .len = 0, };
 	unsigned long new_flags;
 
-	ACCESS_ONCE(ctx->released) = true;
+	WRITE_ONCE(ctx->released, true);
 
 	if (!mmget_not_zero(mm))
 		goto wakeup;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 51bf7b827387..129975970d99 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -592,9 +592,9 @@ xlog_valid_lsn(
 	 * a transiently forward state. Instead, we can see the LSN in a
 	 * transiently behind state if we happen to race with a cycle wrap.
 	 */
-	cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+	cur_cycle = READ_ONCE(log->l_curr_cycle);
 	smp_rmb();
-	cur_block = ACCESS_ONCE(log->l_curr_block);
+	cur_block = READ_ONCE(log->l_curr_block);
 
 	if ((CYCLE_LSN(lsn) > cur_cycle) ||
 	    (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 8fbe259b197c..0a7ce668f8e0 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -236,7 +236,7 @@ static inline unsigned long __ffs64(u64 word)
 	typeof(*ptr) old, new;					\
 								\
 	do {							\
-		old = ACCESS_ONCE(*ptr);			\
+		old = READ_ONCE(*ptr);			\
 		new = (old & ~mask) | bits;			\
 	} while (cmpxchg(ptr, old, new) != old);		\
 								\
@@ -251,7 +251,7 @@ static inline unsigned long __ffs64(u64 word)
 	typeof(*ptr) old, new;					\
 								\
 	do {							\
-		old = ACCESS_ONCE(*ptr);			\
+		old = READ_ONCE(*ptr);			\
 		new = old & ~clear;				\
 	} while (!(old & test) &&				\
 		 cmpxchg(ptr, old, new) != old);		\
diff --git a/include/linux/dynamic_queue_limits.h b/include/linux/dynamic_queue_limits.h
index a4be70398ce1..36dd4ffb5715 100644
--- a/include/linux/dynamic_queue_limits.h
+++ b/include/linux/dynamic_queue_limits.h
@@ -88,7 +88,7 @@ static inline void dql_queued(struct dql *dql, unsigned int count)
 /* Returns how many objects can be queued, < 0 indicates over limit. */
 static inline int dql_avail(const struct dql *dql)
 {
-	return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued);
+	return READ_ONCE(dql->adj_limit) - READ_ONCE(dql->num_queued);
 }
 
 /* Record number of completed objects and recalculate the limit. */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 14bc21c2ee7f..785a00ca4628 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -221,7 +221,7 @@ extern struct page *huge_zero_page;
 
 static inline bool is_huge_zero_page(struct page *page)
 {
-	return ACCESS_ONCE(huge_zero_page) == page;
+	return READ_ONCE(huge_zero_page) == page;
 }
 
 static inline bool is_huge_zero_pmd(pmd_t pmd)
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 30294603526f..d95cae09dea0 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -247,7 +247,7 @@ static inline struct team_port *team_get_port_by_index(struct team *team,
 
 static inline int team_num_to_port_index(struct team *team, unsigned int num)
 {
-	int en_port_count = ACCESS_ONCE(team->en_port_count);
+	int en_port_count = READ_ONCE(team->en_port_count);
 
 	if (unlikely(!en_port_count))
 		return 0;
diff --git a/include/linux/llist.h b/include/linux/llist.h
index 1957635e6d5f..85abc2915e8d 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -198,7 +198,7 @@ static inline void init_llist_head(struct llist_head *list)
  */
 static inline bool llist_empty(const struct llist_head *head)
 {
-	return ACCESS_ONCE(head->first) == NULL;
+	return READ_ONCE(head->first) == NULL;
 }
 
 static inline struct llist_node *llist_next(struct llist_node *node)
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 2efb08a60e63..f0fc4700b6ff 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -105,7 +105,7 @@ static inline bool pm_runtime_callbacks_present(struct device *dev)
 
 static inline void pm_runtime_mark_last_busy(struct device *dev)
 {
-	ACCESS_ONCE(dev->power.last_busy) = jiffies;
+	WRITE_ONCE(dev->power.last_busy, jiffies);
 }
 
 static inline bool pm_runtime_is_irq_safe(struct device *dev)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 4f4f786255ef..3fadb6f9982b 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -983,12 +983,12 @@ static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
 
 static inline int sysctl_sync_period(struct netns_ipvs *ipvs)
 {
-	return ACCESS_ONCE(ipvs->sysctl_sync_threshold[1]);
+	return READ_ONCE(ipvs->sysctl_sync_threshold[1]);
 }
 
 static inline unsigned int sysctl_sync_refresh_period(struct netns_ipvs *ipvs)
 {
-	return ACCESS_ONCE(ipvs->sysctl_sync_refresh_period);
+	return READ_ONCE(ipvs->sysctl_sync_refresh_period);
 }
 
 static inline int sysctl_sync_retries(struct netns_ipvs *ipvs)
@@ -1013,7 +1013,7 @@ static inline int sysctl_sloppy_sctp(struct netns_ipvs *ipvs)
 
 static inline int sysctl_sync_ports(struct netns_ipvs *ipvs)
 {
-	return ACCESS_ONCE(ipvs->sysctl_sync_ports);
+	return READ_ONCE(ipvs->sysctl_sync_ports);
 }
 
 static inline int sysctl_sync_persist_mode(struct netns_ipvs *ipvs)
diff --git a/kernel/acct.c b/kernel/acct.c
index 5e72af29ab73..21eedd0dd81a 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -146,7 +146,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
 again:
 	smp_rmb();
 	rcu_read_lock();
-	res = to_acct(ACCESS_ONCE(ns->bacct));
+	res = to_acct(READ_ONCE(ns->bacct));
 	if (!res) {
 		rcu_read_unlock();
 		return NULL;
@@ -158,7 +158,7 @@ again:
 	}
 	rcu_read_unlock();
 	mutex_lock(&res->lock);
-	if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+	if (res != to_acct(READ_ONCE(ns->bacct))) {
 		mutex_unlock(&res->lock);
 		acct_put(res);
 		goto again;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 824a583079a1..8fd2f2d1358a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1200,7 +1200,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
 
 again:
 	rcu_read_lock();
-	ctx = ACCESS_ONCE(event->ctx);
+	ctx = READ_ONCE(event->ctx);
 	if (!atomic_inc_not_zero(&ctx->refcount)) {
 		rcu_read_unlock();
 		goto again;
@@ -5302,8 +5302,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		if (!rb)
 			goto aux_unlock;
 
-		aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
-		aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+		aux_offset = READ_ONCE(rb->user_page->aux_offset);
+		aux_size = READ_ONCE(rb->user_page->aux_size);
 
 		if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
 			goto aux_unlock;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index f684d8e5fa2b..f3e37971c842 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	 * (B) <-> (C) ordering is still observed by the pmu driver.
 	 */
 	if (!rb->aux_overwrite) {
-		aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
+		aux_tail = READ_ONCE(rb->user_page->aux_tail);
 		handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
 		if (aux_head - aux_tail < perf_aux_size(rb))
 			handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..6b4298a41167 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1339,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
 	 * can't confuse the checks below.
 	 */
-	int exit_state = ACCESS_ONCE(p->exit_state);
+	int exit_state = READ_ONCE(p->exit_state);
 	int ret;
 
 	if (unlikely(exit_state == EXIT_DEAD))
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 81279c6602ff..845f3805c73d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2724,7 +2724,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	 * if it happened, we have to fail the write.
 	 */
 	barrier();
-	if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+	if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) {
 		local_dec(&cpu_buffer->committing);
 		local_dec(&cpu_buffer->commits);
 		return NULL;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 652c682707cd..9050c8b3ccde 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1459,7 +1459,7 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr,
 
 static inline void *event_file_data(struct file *filp)
 {
-	return ACCESS_ONCE(file_inode(filp)->i_private);
+	return READ_ONCE(file_inode(filp)->i_private);
 }
 
 extern struct mutex event_mutex;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 49cb41412eec..780262210c9a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -77,7 +77,7 @@ check_stack(unsigned long ip, unsigned long *stack)
 {
 	unsigned long this_size, flags; unsigned long *p, *top, *start;
 	static int tracer_frame;
-	int frame_size = ACCESS_ONCE(tracer_frame);
+	int frame_size = READ_ONCE(tracer_frame);
 	int i, x;
 
 	this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..d32b45662fb6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -894,7 +894,7 @@ static bool new_idmap_permitted(const struct file *file,
 int proc_setgroups_show(struct seq_file *seq, void *v)
 {
 	struct user_namespace *ns = seq->private;
-	unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+	unsigned long userns_flags = READ_ONCE(ns->flags);
 
 	seq_printf(seq, "%s\n",
 		   (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
diff --git a/lib/assoc_array.c b/lib/assoc_array.c
index 155c55d8db5f..fe7953aead82 100644
--- a/lib/assoc_array.c
+++ b/lib/assoc_array.c
@@ -39,7 +39,7 @@ begin_node:
 		/* Descend through a shortcut */
 		shortcut = assoc_array_ptr_to_shortcut(cursor);
 		smp_read_barrier_depends();
-		cursor = ACCESS_ONCE(shortcut->next_node);
+		cursor = READ_ONCE(shortcut->next_node);
 	}
 
 	node = assoc_array_ptr_to_node(cursor);
@@ -55,7 +55,7 @@ begin_node:
 	 */
 	has_meta = 0;
 	for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
-		ptr = ACCESS_ONCE(node->slots[slot]);
+		ptr = READ_ONCE(node->slots[slot]);
 		has_meta |= (unsigned long)ptr;
 		if (ptr && assoc_array_ptr_is_leaf(ptr)) {
 			/* We need a barrier between the read of the pointer
@@ -89,7 +89,7 @@ continue_node:
 	smp_read_barrier_depends();
 
 	for (; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
-		ptr = ACCESS_ONCE(node->slots[slot]);
+		ptr = READ_ONCE(node->slots[slot]);
 		if (assoc_array_ptr_is_meta(ptr)) {
 			cursor = ptr;
 			goto begin_node;
@@ -98,7 +98,7 @@ continue_node:
 
 finished_node:
 	/* Move up to the parent (may need to skip back over a shortcut) */
-	parent = ACCESS_ONCE(node->back_pointer);
+	parent = READ_ONCE(node->back_pointer);
 	slot = node->parent_slot;
 	if (parent == stop)
 		return 0;
@@ -107,7 +107,7 @@ finished_node:
 		shortcut = assoc_array_ptr_to_shortcut(parent);
 		smp_read_barrier_depends();
 		cursor = parent;
-		parent = ACCESS_ONCE(shortcut->back_pointer);
+		parent = READ_ONCE(shortcut->back_pointer);
 		slot = shortcut->parent_slot;
 		if (parent == stop)
 			return 0;
@@ -147,7 +147,7 @@ int assoc_array_iterate(const struct assoc_array *array,
 					void *iterator_data),
 			void *iterator_data)
 {
-	struct assoc_array_ptr *root = ACCESS_ONCE(array->root);
+	struct assoc_array_ptr *root = READ_ONCE(array->root);
 
 	if (!root)
 		return 0;
@@ -194,7 +194,7 @@ assoc_array_walk(const struct assoc_array *array,
 
 	pr_devel("-->%s()\n", __func__);
 
-	cursor = ACCESS_ONCE(array->root);
+	cursor = READ_ONCE(array->root);
 	if (!cursor)
 		return assoc_array_walk_tree_empty;
 
@@ -220,7 +220,7 @@ consider_node:
 
 	slot = segments >> (level & ASSOC_ARRAY_KEY_CHUNK_MASK);
 	slot &= ASSOC_ARRAY_FAN_MASK;
-	ptr = ACCESS_ONCE(node->slots[slot]);
+	ptr = READ_ONCE(node->slots[slot]);
 
 	pr_devel("consider slot %x [ix=%d type=%lu]\n",
 		 slot, level, (unsigned long)ptr & 3);
@@ -294,7 +294,7 @@ follow_shortcut:
 	} while (sc_level < shortcut->skip_to_level);
 
 	/* The shortcut matches the leaf's index to this point. */
-	cursor = ACCESS_ONCE(shortcut->next_node);
+	cursor = READ_ONCE(shortcut->next_node);
 	if (((level ^ sc_level) & ~ASSOC_ARRAY_KEY_CHUNK_MASK) != 0) {
 		level = sc_level;
 		goto jumped;
@@ -337,7 +337,7 @@ void *assoc_array_find(const struct assoc_array *array,
 	 * the terminal node.
 	 */
 	for (slot = 0; slot < ASSOC_ARRAY_FAN_OUT; slot++) {
-		ptr = ACCESS_ONCE(node->slots[slot]);
+		ptr = READ_ONCE(node->slots[slot]);
 		if (ptr && assoc_array_ptr_is_leaf(ptr)) {
 			/* We need a barrier between the read of the pointer
 			 * and dereferencing the pointer - but only if we are
diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c
index f346715e2255..81770a55cb16 100644
--- a/lib/dynamic_queue_limits.c
+++ b/lib/dynamic_queue_limits.c
@@ -20,7 +20,7 @@ void dql_completed(struct dql *dql, unsigned int count)
 	unsigned int ovlimit, completed, num_queued;
 	bool all_prev_completed;
 
-	num_queued = ACCESS_ONCE(dql->num_queued);
+	num_queued = READ_ONCE(dql->num_queued);
 
 	/* Can't complete more than what's in queue */
 	BUG_ON(count > num_queued - dql->num_completed);
diff --git a/lib/llist.c b/lib/llist.c
index ae5872b1df0c..7062e931a7bb 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -41,7 +41,7 @@ bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
 	struct llist_node *first;
 
 	do {
-		new_last->next = first = ACCESS_ONCE(head->first);
+		new_last->next = first = READ_ONCE(head->first);
 	} while (cmpxchg(&head->first, first, new_first) != first);
 
 	return !first;
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 86c3385b9eb3..1746bae94d41 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -620,8 +620,8 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp
 
 	rcu_read_lock();
 	for (i = 0; i < depth; i++, d = p) {
-		p = ACCESS_ONCE(d->d_parent);
-		array[i] = ACCESS_ONCE(d->d_name.name);
+		p = READ_ONCE(d->d_parent);
+		array[i] = READ_ONCE(d->d_name.name);
 		if (p == d) {
 			if (i)
 				array[i] = "";
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 269b5df58543..c3bf907a03ee 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2715,7 +2715,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
 		struct shrink_control *sc)
 {
 	struct pglist_data *pgdata = NODE_DATA(sc->nid);
-	return ACCESS_ONCE(pgdata->split_queue_len);
+	return READ_ONCE(pgdata->split_queue_len);
 }
 
 static unsigned long deferred_split_scan(struct shrinker *shrink,
diff --git a/net/core/dev.c b/net/core/dev.c
index 11596a302a26..61559ca3980b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3725,7 +3725,7 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	if (flow_table && flow_id <= flow_table->mask) {
 		rflow = &flow_table->flows[flow_id];
-		cpu = ACCESS_ONCE(rflow->cpu);
+		cpu = READ_ONCE(rflow->cpu);
 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
 			   rflow->last_qtail) <
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 6e1e10ff433a..3b2034f6d49d 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3377,7 +3377,7 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
 
 static void pktgen_xmit(struct pktgen_dev *pkt_dev)
 {
-	unsigned int burst = ACCESS_ONCE(pkt_dev->burst);
+	unsigned int burst = READ_ONCE(pkt_dev->burst);
 	struct net_device *odev = pkt_dev->odev;
 	struct netdev_queue *txq;
 	struct sk_buff *skb;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index af74d0433453..f9597ba26599 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -164,7 +164,7 @@ static void inet_frag_worker(struct work_struct *work)
 
 	local_bh_disable();
 
-	for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) {
+	for (i = READ_ONCE(f->next_bucket); budget; --budget) {
 		evicted += inet_evict_bucket(f, &f->hash[i]);
 		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
 		if (evicted > INETFRAGS_EVICT_MAX)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3d9f1c2f81c5..c0864562083b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -495,7 +495,7 @@ u32 ip_idents_reserve(u32 hash, int segs)
 {
 	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
-	u32 old = ACCESS_ONCE(*p_tstamp);
+	u32 old = READ_ONCE(*p_tstamp);
 	u32 now = (u32)jiffies;
 	u32 new, delta = 0;
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0bc9e46a5369..48531da1aba6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1908,7 +1908,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
 		goto send_now;
 
-	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
+	win_divisor = READ_ONCE(sysctl_tcp_tso_win_divisor);
 	if (win_divisor) {
 		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ebfbccae62fd..02ec9a349303 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1853,7 +1853,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		 */
 
 		/* if we're overly short, let UDP handle it */
-		encap_rcv = ACCESS_ONCE(up->encap_rcv);
+		encap_rcv = READ_ONCE(up->encap_rcv);
 		if (encap_rcv) {
 			int ret;
 
@@ -2298,7 +2298,7 @@ void udp_destroy_sock(struct sock *sk)
 	unlock_sock_fast(sk, slow);
 	if (static_key_false(&udp_encap_needed) && up->encap_type) {
 		void (*encap_destroy)(struct sock *sk);
-		encap_destroy = ACCESS_ONCE(up->encap_destroy);
+		encap_destroy = READ_ONCE(up->encap_destroy);
 		if (encap_destroy)
 			encap_destroy(sk);
 	}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a1c24443cd9e..dab946554157 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -490,7 +490,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
 	if (!t)
 		goto out;
 
-	tproto = ACCESS_ONCE(t->parms.proto);
+	tproto = READ_ONCE(t->parms.proto);
 	if (tproto != ipproto && tproto != 0)
 		goto out;
 
@@ -899,7 +899,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
 	t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
 
 	if (t) {
-		u8 tproto = ACCESS_ONCE(t->parms.proto);
+		u8 tproto = READ_ONCE(t->parms.proto);
 
 		if (tproto != ipproto && tproto != 0)
 			goto drop;
@@ -1233,7 +1233,7 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
 
-	tproto = ACCESS_ONCE(t->parms.proto);
+	tproto = READ_ONCE(t->parms.proto);
 	if (tproto != IPPROTO_IPIP && tproto != 0)
 		return -1;
 
@@ -1303,7 +1303,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
 	u8 tproto;
 	int err;
 
-	tproto = ACCESS_ONCE(t->parms.proto);
+	tproto = READ_ONCE(t->parms.proto);
 	if ((tproto != IPPROTO_IPV6 && tproto != 0) ||
 	    ip6_tnl_addr_conflict(t, ipv6h))
 		return -1;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 40d7234c27b9..3f30fa313bf2 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -606,7 +606,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		 */
 
 		/* if we're overly short, let UDP handle it */
-		encap_rcv = ACCESS_ONCE(up->encap_rcv);
+		encap_rcv = READ_ONCE(up->encap_rcv);
 		if (encap_rcv) {
 			int ret;
 
@@ -1432,7 +1432,7 @@ void udpv6_destroy_sock(struct sock *sk)
 
 	if (static_key_false(&udpv6_encap_needed) && up->encap_type) {
 		void (*encap_destroy)(struct sock *sk);
-		encap_destroy = ACCESS_ONCE(up->encap_destroy);
+		encap_destroy = READ_ONCE(up->encap_destroy);
 		if (encap_destroy)
 			encap_destroy(sk);
 	}
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index dd3e83328ad5..82cb93f66b9b 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -193,7 +193,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
 	 */
 	rcv = rcu_dereference(sap->rcv_func);
 	dest = llc_pdu_type(skb);
-	sap_handler = dest ? ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL;
+	sap_handler = dest ? READ_ONCE(llc_type_handlers[dest - 1]) : NULL;
 	if (unlikely(!sap_handler)) {
 		if (rcv)
 			rcv(skb, dev, pt, orig_dev);
@@ -214,7 +214,7 @@ drop:
 	kfree_skb(skb);
 	goto out;
 handle_station:
-	sta_handler = ACCESS_ONCE(llc_station_handler);
+	sta_handler = READ_ONCE(llc_station_handler);
 	if (!sta_handler)
 		goto drop;
 	sta_handler(skb);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 69615016d5bf..214d2ba02877 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2008,7 +2008,7 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate,
 
 static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo)
 {
-	u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate);
+	u16 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate);
 
 	if (rate == STA_STATS_RATE_INVALID)
 		return -EINVAL;
diff --git a/net/netlabel/netlabel_calipso.c b/net/netlabel/netlabel_calipso.c
index d177dd066504..4d748975117d 100644
--- a/net/netlabel/netlabel_calipso.c
+++ b/net/netlabel/netlabel_calipso.c
@@ -393,7 +393,7 @@ EXPORT_SYMBOL(netlbl_calipso_ops_register);
 
 static const struct netlbl_calipso_ops *netlbl_calipso_ops_get(void)
 {
-	return ACCESS_ONCE(calipso_ops);
+	return READ_ONCE(calipso_ops);
 }
 
 /**
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d396cb61a280..eb866647a27a 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -14201,7 +14201,7 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd,
 	struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
 	struct sk_buff *msg;
 	void *hdr;
-	u32 nlportid = ACCESS_ONCE(wdev->ap_unexpected_nlportid);
+	u32 nlportid = READ_ONCE(wdev->ap_unexpected_nlportid);
 
 	if (!nlportid)
 		return false;
diff --git a/sound/firewire/amdtp-am824.c b/sound/firewire/amdtp-am824.c
index 23ccddb20de1..4210e5c6262e 100644
--- a/sound/firewire/amdtp-am824.c
+++ b/sound/firewire/amdtp-am824.c
@@ -247,7 +247,7 @@ void amdtp_am824_midi_trigger(struct amdtp_stream *s, unsigned int port,
 	struct amdtp_am824 *p = s->protocol;
 
 	if (port < p->midi_ports)
-		ACCESS_ONCE(p->midi[port]) = midi;
+		WRITE_ONCE(p->midi[port], midi);
 }
 EXPORT_SYMBOL_GPL(amdtp_am824_midi_trigger);
 
@@ -336,7 +336,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s, __be32 *buffe
 					   unsigned int data_blocks, unsigned int *syt)
 {
 	struct amdtp_am824 *p = s->protocol;
-	struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm);
+	struct snd_pcm_substream *pcm = READ_ONCE(s->pcm);
 	unsigned int pcm_frames;
 
 	if (pcm) {
@@ -357,7 +357,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s, __be32 *buffe
 					   unsigned int data_blocks, unsigned int *syt)
 {
 	struct amdtp_am824 *p = s->protocol;
-	struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm);
+	struct snd_pcm_substream *pcm = READ_ONCE(s->pcm);
 	unsigned int pcm_frames;
 
 	if (pcm) {
diff --git a/sound/firewire/amdtp-stream.c b/sound/firewire/amdtp-stream.c
index 3fc581a5ad62..4a1dc145327b 100644
--- a/sound/firewire/amdtp-stream.c
+++ b/sound/firewire/amdtp-stream.c
@@ -376,7 +376,7 @@ static void update_pcm_pointers(struct amdtp_stream *s,
 	ptr = s->pcm_buffer_pointer + frames;
 	if (ptr >= pcm->runtime->buffer_size)
 		ptr -= pcm->runtime->buffer_size;
-	ACCESS_ONCE(s->pcm_buffer_pointer) = ptr;
+	WRITE_ONCE(s->pcm_buffer_pointer, ptr);
 
 	s->pcm_period_pointer += frames;
 	if (s->pcm_period_pointer >= pcm->runtime->period_size) {
@@ -388,7 +388,7 @@ static void update_pcm_pointers(struct amdtp_stream *s,
 static void pcm_period_tasklet(unsigned long data)
 {
 	struct amdtp_stream *s = (void *)data;
-	struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm);
+	struct snd_pcm_substream *pcm = READ_ONCE(s->pcm);
 
 	if (pcm)
 		snd_pcm_period_elapsed(pcm);
@@ -453,7 +453,7 @@ static int handle_out_packet(struct amdtp_stream *s,
 		s->data_block_counter =
 				(s->data_block_counter + data_blocks) & 0xff;
 
-	buffer[0] = cpu_to_be32(ACCESS_ONCE(s->source_node_id_field) |
+	buffer[0] = cpu_to_be32(READ_ONCE(s->source_node_id_field) |
 				(s->data_block_quadlets << CIP_DBS_SHIFT) |
 				((s->sph << CIP_SPH_SHIFT) & CIP_SPH_MASK) |
 				s->data_block_counter);
@@ -472,7 +472,7 @@ static int handle_out_packet(struct amdtp_stream *s,
 	if (queue_out_packet(s, payload_length) < 0)
 		return -EIO;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm && pcm_frames > 0)
 		update_pcm_pointers(s, pcm, pcm_frames);
 
@@ -504,7 +504,7 @@ static int handle_out_packet_without_header(struct amdtp_stream *s,
 	if (queue_out_packet(s, payload_length) < 0)
 		return -EIO;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm && pcm_frames > 0)
 		update_pcm_pointers(s, pcm, pcm_frames);
 
@@ -621,7 +621,7 @@ end:
 	if (queue_in_packet(s) < 0)
 		return -EIO;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm && pcm_frames > 0)
 		update_pcm_pointers(s, pcm, pcm_frames);
 
@@ -649,7 +649,7 @@ static int handle_in_packet_without_header(struct amdtp_stream *s,
 	if (queue_in_packet(s) < 0)
 		return -EIO;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm && pcm_frames > 0)
 		update_pcm_pointers(s, pcm, pcm_frames);
 
@@ -947,7 +947,7 @@ unsigned long amdtp_stream_pcm_pointer(struct amdtp_stream *s)
 	if (!in_interrupt() && amdtp_stream_running(s))
 		fw_iso_context_flush_completions(s->context);
 
-	return ACCESS_ONCE(s->pcm_buffer_pointer);
+	return READ_ONCE(s->pcm_buffer_pointer);
 }
 EXPORT_SYMBOL(amdtp_stream_pcm_pointer);
 
@@ -977,9 +977,8 @@ EXPORT_SYMBOL(amdtp_stream_pcm_ack);
 void amdtp_stream_update(struct amdtp_stream *s)
 {
 	/* Precomputing. */
-	ACCESS_ONCE(s->source_node_id_field) =
-		(fw_parent_device(s->unit)->card->node_id << CIP_SID_SHIFT) &
-								CIP_SID_MASK;
+	WRITE_ONCE(s->source_node_id_field,
+                   (fw_parent_device(s->unit)->card->node_id << CIP_SID_SHIFT) & CIP_SID_MASK);
 }
 EXPORT_SYMBOL(amdtp_stream_update);
 
@@ -1022,7 +1021,7 @@ void amdtp_stream_pcm_abort(struct amdtp_stream *s)
 {
 	struct snd_pcm_substream *pcm;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm)
 		snd_pcm_stop_xrun(pcm);
 }
diff --git a/sound/firewire/amdtp-stream.h b/sound/firewire/amdtp-stream.h
index ed6eafd10992..f9abd8b07ce6 100644
--- a/sound/firewire/amdtp-stream.h
+++ b/sound/firewire/amdtp-stream.h
@@ -220,7 +220,7 @@ static inline bool amdtp_stream_pcm_running(struct amdtp_stream *s)
 static inline void amdtp_stream_pcm_trigger(struct amdtp_stream *s,
 					    struct snd_pcm_substream *pcm)
 {
-	ACCESS_ONCE(s->pcm) = pcm;
+	WRITE_ONCE(s->pcm, pcm);
 }
 
 static inline bool cip_sfc_is_base_44100(enum cip_sfc sfc)
diff --git a/sound/firewire/digi00x/amdtp-dot.c b/sound/firewire/digi00x/amdtp-dot.c
index 1453c34ce99f..4a884a335248 100644
--- a/sound/firewire/digi00x/amdtp-dot.c
+++ b/sound/firewire/digi00x/amdtp-dot.c
@@ -327,7 +327,7 @@ void amdtp_dot_midi_trigger(struct amdtp_stream *s, unsigned int port,
 	struct amdtp_dot *p = s->protocol;
 
 	if (port < MAX_MIDI_PORTS)
-		ACCESS_ONCE(p->midi[port]) = midi;
+		WRITE_ONCE(p->midi[port], midi);
 }
 
 static unsigned int process_tx_data_blocks(struct amdtp_stream *s,
@@ -338,7 +338,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s,
 	struct snd_pcm_substream *pcm;
 	unsigned int pcm_frames;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm) {
 		read_pcm_s32(s, pcm, buffer, data_blocks);
 		pcm_frames = data_blocks;
@@ -359,7 +359,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s,
 	struct snd_pcm_substream *pcm;
 	unsigned int pcm_frames;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm) {
 		write_pcm_s32(s, pcm, buffer, data_blocks);
 		pcm_frames = data_blocks;
diff --git a/sound/firewire/fireface/amdtp-ff.c b/sound/firewire/fireface/amdtp-ff.c
index 780da9deb2f0..77c7598b61ab 100644
--- a/sound/firewire/fireface/amdtp-ff.c
+++ b/sound/firewire/fireface/amdtp-ff.c
@@ -108,7 +108,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s,
 					   unsigned int data_blocks,
 					   unsigned int *syt)
 {
-	struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm);
+	struct snd_pcm_substream *pcm = READ_ONCE(s->pcm);
 	unsigned int pcm_frames;
 
 	if (pcm) {
@@ -127,7 +127,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s,
 					   unsigned int data_blocks,
 					   unsigned int *syt)
 {
-	struct snd_pcm_substream *pcm = ACCESS_ONCE(s->pcm);
+	struct snd_pcm_substream *pcm = READ_ONCE(s->pcm);
 	unsigned int pcm_frames;
 
 	if (pcm) {
diff --git a/sound/firewire/fireface/ff-midi.c b/sound/firewire/fireface/ff-midi.c
index 949ee56b4e0e..6a49611ee462 100644
--- a/sound/firewire/fireface/ff-midi.c
+++ b/sound/firewire/fireface/ff-midi.c
@@ -22,7 +22,7 @@ static int midi_playback_open(struct snd_rawmidi_substream *substream)
 	ff->running_status[substream->number] = 0;
 	ff->rx_midi_error[substream->number] = false;
 
-	ACCESS_ONCE(ff->rx_midi_substreams[substream->number]) = substream;
+	WRITE_ONCE(ff->rx_midi_substreams[substream->number], substream);
 
 	return 0;
 }
@@ -38,7 +38,7 @@ static int midi_playback_close(struct snd_rawmidi_substream *substream)
 	struct snd_ff *ff = substream->rmidi->private_data;
 
 	cancel_work_sync(&ff->rx_midi_work[substream->number]);
-	ACCESS_ONCE(ff->rx_midi_substreams[substream->number]) = NULL;
+	WRITE_ONCE(ff->rx_midi_substreams[substream->number], NULL);
 
 	return 0;
 }
@@ -52,10 +52,10 @@ static void midi_capture_trigger(struct snd_rawmidi_substream *substream,
 	spin_lock_irqsave(&ff->lock, flags);
 
 	if (up)
-		ACCESS_ONCE(ff->tx_midi_substreams[substream->number]) =
-								substream;
+		WRITE_ONCE(ff->tx_midi_substreams[substream->number],
+			   substream);
 	else
-		ACCESS_ONCE(ff->tx_midi_substreams[substream->number]) = NULL;
+		WRITE_ONCE(ff->tx_midi_substreams[substream->number], NULL);
 
 	spin_unlock_irqrestore(&ff->lock, flags);
 }
diff --git a/sound/firewire/fireface/ff-transaction.c b/sound/firewire/fireface/ff-transaction.c
index dd6c8e839647..332b29f8ed75 100644
--- a/sound/firewire/fireface/ff-transaction.c
+++ b/sound/firewire/fireface/ff-transaction.c
@@ -12,7 +12,7 @@ static void finish_transmit_midi_msg(struct snd_ff *ff, unsigned int port,
 				     int rcode)
 {
 	struct snd_rawmidi_substream *substream =
-				ACCESS_ONCE(ff->rx_midi_substreams[port]);
+				READ_ONCE(ff->rx_midi_substreams[port]);
 
 	if (rcode_is_permanent_error(rcode)) {
 		ff->rx_midi_error[port] = true;
@@ -60,7 +60,7 @@ static inline void fill_midi_buf(struct snd_ff *ff, unsigned int port,
 static void transmit_midi_msg(struct snd_ff *ff, unsigned int port)
 {
 	struct snd_rawmidi_substream *substream =
-			ACCESS_ONCE(ff->rx_midi_substreams[port]);
+			READ_ONCE(ff->rx_midi_substreams[port]);
 	u8 *buf = (u8 *)ff->msg_buf[port];
 	int i, len;
 
@@ -159,7 +159,7 @@ static void handle_midi_msg(struct fw_card *card, struct fw_request *request,
 		 */
 		index = (quad >> 8) & 0xff;
 		if (index > 0) {
-			substream = ACCESS_ONCE(ff->tx_midi_substreams[0]);
+			substream = READ_ONCE(ff->tx_midi_substreams[0]);
 			if (substream != NULL) {
 				byte = quad & 0xff;
 				snd_rawmidi_receive(substream, &byte, 1);
@@ -169,7 +169,7 @@ static void handle_midi_msg(struct fw_card *card, struct fw_request *request,
 		/* Message in second port. */
 		index = (quad >> 24) & 0xff;
 		if (index > 0) {
-			substream = ACCESS_ONCE(ff->tx_midi_substreams[1]);
+			substream = READ_ONCE(ff->tx_midi_substreams[1]);
 			if (substream != NULL) {
 				byte = (quad >> 16) & 0xff;
 				snd_rawmidi_receive(substream, &byte, 1);
diff --git a/sound/firewire/isight.c b/sound/firewire/isight.c
index 5826aa8362f1..46092fa3ff9b 100644
--- a/sound/firewire/isight.c
+++ b/sound/firewire/isight.c
@@ -96,7 +96,7 @@ static void isight_update_pointers(struct isight *isight, unsigned int count)
 	ptr += count;
 	if (ptr >= runtime->buffer_size)
 		ptr -= runtime->buffer_size;
-	ACCESS_ONCE(isight->buffer_pointer) = ptr;
+	WRITE_ONCE(isight->buffer_pointer, ptr);
 
 	isight->period_counter += count;
 	if (isight->period_counter >= runtime->period_size) {
@@ -111,7 +111,7 @@ static void isight_samples(struct isight *isight,
 	struct snd_pcm_runtime *runtime;
 	unsigned int count1;
 
-	if (!ACCESS_ONCE(isight->pcm_running))
+	if (!READ_ONCE(isight->pcm_running))
 		return;
 
 	runtime = isight->pcm->runtime;
@@ -131,7 +131,7 @@ static void isight_samples(struct isight *isight,
 
 static void isight_pcm_abort(struct isight *isight)
 {
-	if (ACCESS_ONCE(isight->pcm_active))
+	if (READ_ONCE(isight->pcm_active))
 		snd_pcm_stop_xrun(isight->pcm);
 }
 
@@ -141,7 +141,7 @@ static void isight_dropped_samples(struct isight *isight, unsigned int total)
 	u32 dropped;
 	unsigned int count1;
 
-	if (!ACCESS_ONCE(isight->pcm_running))
+	if (!READ_ONCE(isight->pcm_running))
 		return;
 
 	runtime = isight->pcm->runtime;
@@ -293,7 +293,7 @@ static int isight_hw_params(struct snd_pcm_substream *substream,
 	if (err < 0)
 		return err;
 
-	ACCESS_ONCE(isight->pcm_active) = true;
+	WRITE_ONCE(isight->pcm_active, true);
 
 	return 0;
 }
@@ -331,7 +331,7 @@ static int isight_hw_free(struct snd_pcm_substream *substream)
 {
 	struct isight *isight = substream->private_data;
 
-	ACCESS_ONCE(isight->pcm_active) = false;
+	WRITE_ONCE(isight->pcm_active, false);
 
 	mutex_lock(&isight->mutex);
 	isight_stop_streaming(isight);
@@ -424,10 +424,10 @@ static int isight_trigger(struct snd_pcm_substream *substream, int cmd)
 
 	switch (cmd) {
 	case SNDRV_PCM_TRIGGER_START:
-		ACCESS_ONCE(isight->pcm_running) = true;
+		WRITE_ONCE(isight->pcm_running, true);
 		break;
 	case SNDRV_PCM_TRIGGER_STOP:
-		ACCESS_ONCE(isight->pcm_running) = false;
+		WRITE_ONCE(isight->pcm_running, false);
 		break;
 	default:
 		return -EINVAL;
@@ -439,7 +439,7 @@ static snd_pcm_uframes_t isight_pointer(struct snd_pcm_substream *substream)
 {
 	struct isight *isight = substream->private_data;
 
-	return ACCESS_ONCE(isight->buffer_pointer);
+	return READ_ONCE(isight->buffer_pointer);
 }
 
 static int isight_create_pcm(struct isight *isight)
diff --git a/sound/firewire/motu/amdtp-motu.c b/sound/firewire/motu/amdtp-motu.c
index 96f0091144bb..f0555a24d90e 100644
--- a/sound/firewire/motu/amdtp-motu.c
+++ b/sound/firewire/motu/amdtp-motu.c
@@ -310,7 +310,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s,
 	if (p->midi_ports)
 		read_midi_messages(s, buffer, data_blocks);
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (data_blocks > 0 && pcm)
 		read_pcm_s32(s, pcm->runtime, buffer, data_blocks);
 
@@ -374,7 +374,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s,
 	if (p->midi_ports)
 		write_midi_messages(s, buffer, data_blocks);
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm)
 		write_pcm_s32(s, pcm->runtime, buffer, data_blocks);
 	else
diff --git a/sound/firewire/oxfw/oxfw-scs1x.c b/sound/firewire/oxfw/oxfw-scs1x.c
index 02d595665898..f33497cdc706 100644
--- a/sound/firewire/oxfw/oxfw-scs1x.c
+++ b/sound/firewire/oxfw/oxfw-scs1x.c
@@ -112,7 +112,7 @@ static void handle_hss(struct fw_card *card, struct fw_request *request,
 	}
 
 	if (length >= 1) {
-		stream = ACCESS_ONCE(scs->input);
+		stream = READ_ONCE(scs->input);
 		if (stream)
 			midi_input_packet(scs, stream, data, length);
 	}
@@ -183,7 +183,7 @@ static void scs_output_work(struct work_struct *work)
 	if (scs->transaction_running)
 		return;
 
-	stream = ACCESS_ONCE(scs->output);
+	stream = READ_ONCE(scs->output);
 	if (!stream || scs->error) {
 		scs->output_idle = true;
 		wake_up(&scs->idle_wait);
@@ -291,9 +291,9 @@ static void midi_capture_trigger(struct snd_rawmidi_substream *stream, int up)
 
 	if (up) {
 		scs->input_escape_count = 0;
-		ACCESS_ONCE(scs->input) = stream;
+		WRITE_ONCE(scs->input, stream);
 	} else {
-		ACCESS_ONCE(scs->input) = NULL;
+		WRITE_ONCE(scs->input, NULL);
 	}
 }
 
@@ -319,10 +319,10 @@ static void midi_playback_trigger(struct snd_rawmidi_substream *stream, int up)
 		scs->transaction_bytes = 0;
 		scs->error = false;
 
-		ACCESS_ONCE(scs->output) = stream;
+		WRITE_ONCE(scs->output, stream);
 		schedule_work(&scs->work);
 	} else {
-		ACCESS_ONCE(scs->output) = NULL;
+		WRITE_ONCE(scs->output, NULL);
 	}
 }
 static void midi_playback_drain(struct snd_rawmidi_substream *stream)
diff --git a/sound/firewire/tascam/amdtp-tascam.c b/sound/firewire/tascam/amdtp-tascam.c
index 6aff1fc1c72d..ab482423c165 100644
--- a/sound/firewire/tascam/amdtp-tascam.c
+++ b/sound/firewire/tascam/amdtp-tascam.c
@@ -124,7 +124,7 @@ static unsigned int process_tx_data_blocks(struct amdtp_stream *s,
 {
 	struct snd_pcm_substream *pcm;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (data_blocks > 0 && pcm)
 		read_pcm_s32(s, pcm, buffer, data_blocks);
 
@@ -143,7 +143,7 @@ static unsigned int process_rx_data_blocks(struct amdtp_stream *s,
 	/* This field is not used. */
 	*syt = 0x0000;
 
-	pcm = ACCESS_ONCE(s->pcm);
+	pcm = READ_ONCE(s->pcm);
 	if (pcm)
 		write_pcm_s32(s, pcm, buffer, data_blocks);
 	else
diff --git a/sound/firewire/tascam/tascam-transaction.c b/sound/firewire/tascam/tascam-transaction.c
index 8967c52f5032..2ad692dd4b13 100644
--- a/sound/firewire/tascam/tascam-transaction.c
+++ b/sound/firewire/tascam/tascam-transaction.c
@@ -148,7 +148,7 @@ static void async_midi_port_callback(struct fw_card *card, int rcode,
 				     void *callback_data)
 {
 	struct snd_fw_async_midi_port *port = callback_data;
-	struct snd_rawmidi_substream *substream = ACCESS_ONCE(port->substream);
+	struct snd_rawmidi_substream *substream = READ_ONCE(port->substream);
 
 	/* This port is closed. */
 	if (substream == NULL)
@@ -173,7 +173,7 @@ static void midi_port_work(struct work_struct *work)
 {
 	struct snd_fw_async_midi_port *port =
 			container_of(work, struct snd_fw_async_midi_port, work);
-	struct snd_rawmidi_substream *substream = ACCESS_ONCE(port->substream);
+	struct snd_rawmidi_substream *substream = READ_ONCE(port->substream);
 	int generation;
 
 	/* Under transacting or error state. */
@@ -282,7 +282,7 @@ static void handle_midi_tx(struct fw_card *card, struct fw_request *request,
 				bytes = 3;
 		}
 
-		substream = ACCESS_ONCE(tscm->tx_midi_substreams[port]);
+		substream = READ_ONCE(tscm->tx_midi_substreams[port]);
 		if (substream != NULL)
 			snd_rawmidi_receive(substream, b + 1, bytes);
 	}
diff --git a/sound/soc/xtensa/xtfpga-i2s.c b/sound/soc/xtensa/xtfpga-i2s.c
index 8382ffa3bcaf..2472144b329e 100644
--- a/sound/soc/xtensa/xtfpga-i2s.c
+++ b/sound/soc/xtensa/xtfpga-i2s.c
@@ -165,7 +165,7 @@ static bool xtfpga_pcm_push_tx(struct xtfpga_i2s *i2s)
 	tx_substream = rcu_dereference(i2s->tx_substream);
 	tx_active = tx_substream && snd_pcm_running(tx_substream);
 	if (tx_active) {
-		unsigned tx_ptr = ACCESS_ONCE(i2s->tx_ptr);
+		unsigned tx_ptr = READ_ONCE(i2s->tx_ptr);
 		unsigned new_tx_ptr = i2s->tx_fn(i2s, tx_substream->runtime,
 						 tx_ptr);
 
@@ -437,7 +437,7 @@ static int xtfpga_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
 	case SNDRV_PCM_TRIGGER_START:
 	case SNDRV_PCM_TRIGGER_RESUME:
 	case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
-		ACCESS_ONCE(i2s->tx_ptr) = 0;
+		WRITE_ONCE(i2s->tx_ptr, 0);
 		rcu_assign_pointer(i2s->tx_substream, substream);
 		xtfpga_pcm_refill_fifo(i2s);
 		break;
@@ -459,7 +459,7 @@ static snd_pcm_uframes_t xtfpga_pcm_pointer(struct snd_pcm_substream *substream)
 {
 	struct snd_pcm_runtime *runtime = substream->runtime;
 	struct xtfpga_i2s *i2s = runtime->private_data;
-	snd_pcm_uframes_t pos = ACCESS_ONCE(i2s->tx_ptr);
+	snd_pcm_uframes_t pos = READ_ONCE(i2s->tx_ptr);
 
 	return pos < runtime->buffer_size ? pos : 0;
 }
diff --git a/sound/usb/bcd2000/bcd2000.c b/sound/usb/bcd2000/bcd2000.c
index 7371e5b06035..fc579f330601 100644
--- a/sound/usb/bcd2000/bcd2000.c
+++ b/sound/usb/bcd2000/bcd2000.c
@@ -108,7 +108,7 @@ static void bcd2000_midi_handle_input(struct bcd2000 *bcd2k,
 	unsigned int payload_length, tocopy;
 	struct snd_rawmidi_substream *midi_receive_substream;
 
-	midi_receive_substream = ACCESS_ONCE(bcd2k->midi_receive_substream);
+	midi_receive_substream = READ_ONCE(bcd2k->midi_receive_substream);
 	if (!midi_receive_substream)
 		return;
 
@@ -139,7 +139,7 @@ static void bcd2000_midi_send(struct bcd2000 *bcd2k)
 
 	BUILD_BUG_ON(sizeof(device_cmd_prefix) >= BUFSIZE);
 
-	midi_out_substream = ACCESS_ONCE(bcd2k->midi_out_substream);
+	midi_out_substream = READ_ONCE(bcd2k->midi_out_substream);
 	if (!midi_out_substream)
 		return;
 
diff --git a/tools/arch/x86/include/asm/atomic.h b/tools/arch/x86/include/asm/atomic.h
index 328eeceec709..96e2d06cb031 100644
--- a/tools/arch/x86/include/asm/atomic.h
+++ b/tools/arch/x86/include/asm/atomic.h
@@ -24,7 +24,7 @@
  */
 static inline int atomic_read(const atomic_t *v)
 {
-	return ACCESS_ONCE((v)->counter);
+	return READ_ONCE((v)->counter);
 }
 
 /**
diff --git a/tools/include/asm-generic/atomic-gcc.h b/tools/include/asm-generic/atomic-gcc.h
index 5e9738f97bf3..97427e700e3b 100644
--- a/tools/include/asm-generic/atomic-gcc.h
+++ b/tools/include/asm-generic/atomic-gcc.h
@@ -21,7 +21,7 @@
  */
 static inline int atomic_read(const atomic_t *v)
 {
-	return ACCESS_ONCE((v)->counter);
+	return READ_ONCE((v)->counter);
 }
 
 /**
diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h
index 33b5e6cdf38c..d19e11b68de7 100644
--- a/tools/perf/util/auxtrace.h
+++ b/tools/perf/util/auxtrace.h
@@ -378,7 +378,7 @@ struct addr_filters {
 static inline u64 auxtrace_mmap__read_snapshot_head(struct auxtrace_mmap *mm)
 {
 	struct perf_event_mmap_page *pc = mm->userpg;
-	u64 head = ACCESS_ONCE(pc->aux_head);
+	u64 head = READ_ONCE(pc->aux_head);
 
 	/* Ensure all reads are done after we read the head */
 	rmb();
@@ -389,7 +389,7 @@ static inline u64 auxtrace_mmap__read_head(struct auxtrace_mmap *mm)
 {
 	struct perf_event_mmap_page *pc = mm->userpg;
 #if BITS_PER_LONG == 64 || !defined(HAVE_SYNC_COMPARE_AND_SWAP_SUPPORT)
-	u64 head = ACCESS_ONCE(pc->aux_head);
+	u64 head = READ_ONCE(pc->aux_head);
 #else
 	u64 head = __sync_val_compare_and_swap(&pc->aux_head, 0, 0);
 #endif
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 47b5e7dbcb18..aae9645c7122 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -113,7 +113,7 @@ int __perf_session__set_tracepoints_handlers(struct perf_session *session,
 
 extern volatile int session_done;
 
-#define session_done()	ACCESS_ONCE(session_done)
+#define session_done()	READ_ONCE(session_done)
 
 int perf_session__deliver_synth_event(struct perf_session *session,
 				      union perf_event *event,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9deb5a245b83..ce507ae1d4f5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2302,7 +2302,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 				continue;
 			} else if (pass && i > last_boosted_vcpu)
 				break;
-			if (!ACCESS_ONCE(vcpu->preempted))
+			if (!READ_ONCE(vcpu->preempted))
 				continue;
 			if (vcpu == me)
 				continue;
-- 
cgit v1.3-14-g43fede


From d141babe4244945f1d001118578e0eb3ce12729d Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Wed, 25 Oct 2017 17:56:00 +0900
Subject: locking/lockdep: Add a boot parameter allowing unwind in
 cross-release and disable it by default

Johan Hovold reported a heavy performance regression caused by lockdep
cross-release:

 > Boot time (from "Linux version" to login prompt) had in fact doubled
 > since 4.13 where it took 17 seconds (with my current config) compared to
 > the 35 seconds I now see with 4.14-rc4.
 >
 > I quick bisect pointed to lockdep and specifically the following commit:
 >
 >	28a903f63ec0 ("locking/lockdep: Handle non(or multi)-acquisition
 >	               of a crosslock")
 >
 > which I've verified is the commit which doubled the boot time (compared
 > to 28a903f63ec0^) (added by lockdep crossrelease series [1]).

Currently cross-release performs unwind on every acquisition, but that
is very expensive.

This patch makes unwind optional and disables it by default and only
records acquire_ip.

Full stack traces are sometimes required for full analysis, in which
case a boot paramter, crossrelease_fullstack, can be specified.

On my qemu Ubuntu machine (x86_64, 4 cores, 512M), the regression was
fixed. We measure boot times with 'perf stat --null --repeat 10 $QEMU',
where $QEMU launches a kernel with init=/bin/true:

1. No lockdep enabled:

 Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs):

       2.756558155 seconds time elapsed                    ( +-  0.09% )

2. Lockdep enabled:

 Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs):

       2.968710420 seconds time elapsed                    ( +-  0.12% )

3. Lockdep enabled + cross-release enabled:

 Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs):

       3.153839636 seconds time elapsed                    ( +-  0.31% )

4. Lockdep enabled + cross-release enabled + this patch applied:

 Performance counter stats for 'qemu_booting_time.sh bzImage' (10 runs):

       2.963669551 seconds time elapsed                    ( +-  0.11% )

I.e. lockdep cross-release performance is now indistinguishable from
vanilla lockdep.

Bisected-by: Johan Hovold <johan@kernel.org>
Analyzed-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Johan Hovold <johan@kernel.org>
Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: amir73il@gmail.com
Cc: axboe@kernel.dk
Cc: darrick.wong@oracle.com
Cc: david@fromorbit.com
Cc: hch@infradead.org
Cc: idryomov@gmail.com
Cc: johannes.berg@intel.com
Cc: kernel-team@lge.com
Cc: linux-block@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-xfs@vger.kernel.org
Cc: oleg@redhat.com
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/1508921765-15396-5-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt |  3 +++
 kernel/locking/lockdep.c                        | 19 +++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 05496622b4ef..f20ed5e0a720 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -709,6 +709,9 @@
 			It will be ignored when crashkernel=X,high is not used
 			or memory reserved is below 4G.
 
+	crossrelease_fullstack
+			[KNL] Allow to record full stack trace in cross-release
+
 	cryptomgr.notests
                         [KNL] Disable crypto self-tests
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e36e652d996f..160b5d6df7cb 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -76,6 +76,15 @@ module_param(lock_stat, int, 0644);
 #define lock_stat 0
 #endif
 
+static int crossrelease_fullstack;
+static int __init allow_crossrelease_fullstack(char *str)
+{
+	crossrelease_fullstack = 1;
+	return 0;
+}
+
+early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
+
 /*
  * lockdep_lock: protects the lockdep graph, the hashes and the
  *               class/list/hash allocators.
@@ -4863,8 +4872,14 @@ static void add_xhlock(struct held_lock *hlock)
 	xhlock->trace.nr_entries = 0;
 	xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
 	xhlock->trace.entries = xhlock->trace_entries;
-	xhlock->trace.skip = 3;
-	save_stack_trace(&xhlock->trace);
+
+	if (crossrelease_fullstack) {
+		xhlock->trace.skip = 3;
+		save_stack_trace(&xhlock->trace);
+	} else {
+		xhlock->trace.nr_entries = 1;
+		xhlock->trace.entries[0] = hlock->acquire_ip;
+	}
 }
 
 static inline int same_context_xhlock(struct hist_lock *xhlock)
-- 
cgit v1.3-14-g43fede


From e121d64e16484d4a5eba94cd2fa9eb3848b7c9c2 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Wed, 25 Oct 2017 17:56:02 +0900
Subject: locking/lockdep: Introduce
 CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK=y

Add a Kconfig knob that enables the lockdep "crossrelease_fullstack" boot parameter.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: amir73il@gmail.com
Cc: axboe@kernel.dk
Cc: darrick.wong@oracle.com
Cc: david@fromorbit.com
Cc: hch@infradead.org
Cc: idryomov@gmail.com
Cc: johan@kernel.org
Cc: johannes.berg@intel.com
Cc: kernel-team@lge.com
Cc: linux-block@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-xfs@vger.kernel.org
Cc: oleg@redhat.com
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/1508921765-15396-7-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c |  4 ++++
 lib/Kconfig.debug        | 15 +++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 160b5d6df7cb..db933d063bfc 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -76,7 +76,11 @@ module_param(lock_stat, int, 0644);
 #define lock_stat 0
 #endif
 
+#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
+static int crossrelease_fullstack = 1;
+#else
 static int crossrelease_fullstack;
+#endif
 static int __init allow_crossrelease_fullstack(char *str)
 {
 	crossrelease_fullstack = 1;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c1e720a22c71..2b439a515c30 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1179,6 +1179,21 @@ config LOCKDEP_COMPLETIONS
 	 A deadlock caused by wait_for_completion() and complete() can be
 	 detected by lockdep using crossrelease feature.
 
+config BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
+	bool "Enable the boot parameter, crossrelease_fullstack"
+	depends on LOCKDEP_CROSSRELEASE
+	default n
+	help
+	 The lockdep "cross-release" feature needs to record stack traces
+	 (of calling functions) for all acquisitions, for eventual later
+	 use during analysis. By default only a single caller is recorded,
+	 because the unwind operation can be very expensive with deeper
+	 stack chains.
+
+	 However a boot parameter, crossrelease_fullstack, was
+	 introduced since sometimes deeper traces are required for full
+	 analysis. This option turns on the boot parameter.
+
 config DEBUG_LOCKDEP
 	bool "Lock dependency engine debugging"
 	depends on DEBUG_KERNEL && LOCKDEP
-- 
cgit v1.3-14-g43fede


From fd1a5b04dfb899f84ddeb8acdaea6b98283df1e5 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Wed, 25 Oct 2017 17:56:04 +0900
Subject: workqueue: Remove now redundant lock acquisitions wrt. workqueue
 flushes

The workqueue code added manual lock acquisition annotations to catch
deadlocks.

After lockdepcrossrelease was introduced, some of those became redundant,
since wait_for_completion() already does the acquisition and tracking.

Remove the duplicate annotations.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: amir73il@gmail.com
Cc: axboe@kernel.dk
Cc: darrick.wong@oracle.com
Cc: david@fromorbit.com
Cc: hch@infradead.org
Cc: idryomov@gmail.com
Cc: johan@kernel.org
Cc: johannes.berg@intel.com
Cc: kernel-team@lge.com
Cc: linux-block@vger.kernel.org
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-mm@kvack.org
Cc: linux-xfs@vger.kernel.org
Cc: oleg@redhat.com
Cc: tj@kernel.org
Link: http://lkml.kernel.org/r/1508921765-15396-9-git-send-email-byungchul.park@lge.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/workqueue.h |  4 ++--
 kernel/workqueue.c        | 19 +++----------------
 2 files changed, 5 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1c49431f3121..c8a572cb49be 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -218,7 +218,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 									\
 		__init_work((_work), _onstack);				\
 		(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
-		lockdep_init_map(&(_work)->lockdep_map, #_work, &__key, 0); \
+		lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \
 		INIT_LIST_HEAD(&(_work)->entry);			\
 		(_work)->func = (_func);				\
 	} while (0)
@@ -398,7 +398,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
 	static struct lock_class_key __key;				\
 	const char *__lock_name;					\
 									\
-	__lock_name = #fmt#args;					\
+	__lock_name = "(wq_completion)"#fmt#args;			\
 									\
 	__alloc_workqueue_key((fmt), (flags), (max_active),		\
 			      &__key, __lock_name, ##args);		\
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 39831b2f3c5f..160fdc6e839a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2497,15 +2497,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
 	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
 
-	/*
-	 * Explicitly init the crosslock for wq_barrier::done, make its lock
-	 * key a subkey of the corresponding work. As a result we won't
-	 * build a dependency between wq_barrier::done and unrelated work.
-	 */
-	lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
-				   "(complete)wq_barr::done",
-				   target->lockdep_map.key, 1);
-	__init_completion(&barr->done);
+	init_completion_map(&barr->done, &target->lockdep_map);
+
 	barr->task = current;
 
 	/*
@@ -2611,16 +2604,13 @@ void flush_workqueue(struct workqueue_struct *wq)
 	struct wq_flusher this_flusher = {
 		.list = LIST_HEAD_INIT(this_flusher.list),
 		.flush_color = -1,
-		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+		.done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
 	};
 	int next_color;
 
 	if (WARN_ON(!wq_online))
 		return;
 
-	lock_map_acquire(&wq->lockdep_map);
-	lock_map_release(&wq->lockdep_map);
-
 	mutex_lock(&wq->mutex);
 
 	/*
@@ -2883,9 +2873,6 @@ bool flush_work(struct work_struct *work)
 	if (WARN_ON(!wq_online))
 		return false;
 
-	lock_map_acquire(&work->lockdep_map);
-	lock_map_release(&work->lockdep_map);
-
 	if (start_flush_work(work, &barr)) {
 		wait_for_completion(&barr.done);
 		destroy_work_on_stack(&barr.work);
-- 
cgit v1.3-14-g43fede


From 54b933c6c954a8b7b0c2b40a1c4d3f7279d11e22 Mon Sep 17 00:00:00 2001
From: Cheng Jian <cj.chengjian@huawei.com>
Date: Wed, 25 Oct 2017 19:28:27 +0800
Subject: sched/idle: Micro-optimize the idle loop

Move the loop-invariant calculation of 'cpu' in do_idle() out of the loop body,
because the current CPU is always constant.

This improves the generated code both on x86-64 and ARM64:

x86-64:

Before patch (execution in loop):
	864:       0f ae e8                lfence
	867:       65 8b 05 c2 38 f1 7e    mov %gs:0x7ef138c2(%rip),%eax
	86e:       89 c0                   mov %eax,%eax
	870:       48 0f a3 05 68 19 08    bt  %rax,0x1081968(%rip)
	877:	   01

After patch (execution in loop):
	872:       0f ae e8                lfence
	875:       4c 0f a3 25 63 19 08    bt  %r12,0x1081963(%rip)
	87c:       01

ARM64:

Before patch (execution in loop):
	c58:       d5033d9f        dsb     ld
	c5c:       d538d080        mrs     x0, tpidr_el1
	c60:       b8606a61        ldr     w1, [x19,x0]
	c64:       1100fc20        add     w0, w1, #0x3f
	c68:       7100003f        cmp     w1, #0x0
	c6c:       1a81b000        csel    w0, w0, w1, lt
	c70:       13067c00        asr     w0, w0, #6
	c74:       93407c00        sxtw    x0, w0
	c78:       f8607a80        ldr     x0, [x20,x0,lsl #3]
	c7c:       9ac12401        lsr     x1, x0, x1
	c80:       36000581        tbz     w1, #0, d30 <do_idle+0x128>

After patch (execution in loop):
	c84:       d5033d9f        dsb     ld
	c88:       f9400260        ldr     x0, [x19]
	c8c:       ea14001f        tst     x0, x20
	c90:       54000580        b.eq    d40 <do_idle+0x138>

Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
[ Rewrote the title and the changelog. ]
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: huawei.libin@huawei.com
Cc: xiexiuqi@huawei.com
Link: http://lkml.kernel.org/r/1508930907-107755-1-git-send-email-cj.chengjian@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/idle.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index b2e8f0afbb42..7dae9eb8c042 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -209,6 +209,7 @@ exit_idle:
  */
 static void do_idle(void)
 {
+	int cpu = smp_processor_id();
 	/*
 	 * If the arch has a polling bit, we maintain an invariant:
 	 *
@@ -225,7 +226,7 @@ static void do_idle(void)
 		check_pgt_cache();
 		rmb();
 
-		if (cpu_is_offline(smp_processor_id())) {
+		if (cpu_is_offline(cpu)) {
 			cpuhp_report_idle_dead();
 			arch_cpu_idle_dead();
 		}
-- 
cgit v1.3-14-g43fede


From 5aaf1ab55389aeb6ce5527580a1a4d4dbc0f41ff Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 Oct 2017 16:56:50 +0200
Subject: livepatch: Correctly call klp_post_unpatch_callback() in error paths

The post_unpatch_enabled flag in struct klp_callbacks is set when a
pre-patch callback successfully executes, indicating that we need to
call a corresponding post-unpatch callback when the patch is reverted.
This is true for ordinary patch disable as well as the error paths of
klp_patch_object() callers.

As currently coded, we inadvertently execute the post-patch callback
twice in klp_module_coming() when klp_patch_object() fails:

  - We explicitly call klp_post_unpatch_callback() for the failed object
  - We call it again for the same object (and all the others) via
    klp_cleanup_module_patches_limited()

We should clear the flag in klp_post_unpatch_callback() to make
sure that the callback is not called twice. It makes the API
more safe.

(We could have removed the callback from the former error path as it
would be covered by the latter call, but I think that is is cleaner to
clear the post_unpatch_enabled after its invoked. For example, someone
might later decide to call the callback only when obj->patched flag is
set.)

There is another mistake in the error path of klp_coming_module() in
which it skips the post-unpatch callback for the klp_transition_patch.
However, the pre-patch callback was called even for this patch, so be
sure to make the corresponding callbacks for all patches.

Finally, I used this opportunity to make klp_pre_patch_callback() more
readable.

[jkosina@suse.cz: incorporate changelog wording changes proposed by Joe Lawrence]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/core.c | 4 +---
 kernel/livepatch/core.h | 8 +++++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index cafb5a84417d..eb134479c394 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -894,9 +894,7 @@ int klp_module_coming(struct module *mod)
 				pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
 					patch->mod->name, obj->mod->name, ret);
 
-				if (patch != klp_transition_patch)
-					klp_post_unpatch_callback(obj);
-
+				klp_post_unpatch_callback(obj);
 				goto err;
 			}
 
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index 6fc907b54e71..cc3aa708e0b4 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -12,10 +12,10 @@ static inline bool klp_is_object_loaded(struct klp_object *obj)
 
 static inline int klp_pre_patch_callback(struct klp_object *obj)
 {
-	int ret;
+	int ret = 0;
 
-	ret = (obj->callbacks.pre_patch) ?
-		(*obj->callbacks.pre_patch)(obj) : 0;
+	if (obj->callbacks.pre_patch)
+		ret = (*obj->callbacks.pre_patch)(obj);
 
 	obj->callbacks.post_unpatch_enabled = !ret;
 
@@ -39,6 +39,8 @@ static inline void klp_post_unpatch_callback(struct klp_object *obj)
 	if (obj->callbacks.post_unpatch_enabled &&
 	    obj->callbacks.post_unpatch)
 		(*obj->callbacks.post_unpatch)(obj);
+
+	obj->callbacks.post_unpatch_enabled = false;
 }
 
 #endif /* _LIVEPATCH_CORE_H */
-- 
cgit v1.3-14-g43fede


From 89a9a1c1c89cea5f70975c338c011b9f7024dee5 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Fri, 20 Oct 2017 16:56:51 +0200
Subject: livepatch: __klp_disable_patch() should never be called for disabled
 patches

__klp_disable_patch() should never be called when the patch is not
enabled. Let's add the same warning that we have in __klp_enable_patch().

This allows to remove the check when calling klp_pre_unpatch_callback().
It was strange anyway because it repeatedly checked per-patch flag
for each patched object.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/livepatch/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index eb134479c394..287f71e9dbfe 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -282,6 +282,9 @@ static int __klp_disable_patch(struct klp_patch *patch)
 {
 	struct klp_object *obj;
 
+	if (WARN_ON(!patch->enabled))
+		return -EINVAL;
+
 	if (klp_transition_patch)
 		return -EBUSY;
 
@@ -293,7 +296,7 @@ static int __klp_disable_patch(struct klp_patch *patch)
 	klp_init_transition(patch, KLP_UNPATCHED);
 
 	klp_for_each_object(patch, obj)
-		if (patch->enabled && obj->patched)
+		if (obj->patched)
 			klp_pre_unpatch_callback(obj);
 
 	/*
-- 
cgit v1.3-14-g43fede


From d41bf8c9deaed1a90b18d3ffc5639d4c19f0259a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 23 Oct 2017 16:18:27 -0700
Subject: cgroup, sched: Move basic cpu stats from cgroup.stat to cpu.stat

The basic cpu stat is currently shown with "cpu." prefix in
cgroup.stat, and the same information is duplicated in cpu.stat when
cpu controller is enabled.  This is ugly and not very scalable as we
want to expand the coverage of stat information which is always
available.

This patch makes cgroup core always create "cpu.stat" file and show
the basic cpu stat there and calls the cpu controller to show the
extra stats when enabled.  This ensures that the same information
isn't presented in multiple places and makes future expansion of basic
stats easier.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 Documentation/cgroup-v2.txt     | 15 ++++-------
 include/linux/cgroup-defs.h     |  2 ++
 include/linux/cgroup.h          |  2 --
 kernel/cgroup/cgroup-internal.h |  1 +
 kernel/cgroup/cgroup.c          | 60 +++++++++++++++++++++++++++++++++++++++--
 kernel/cgroup/stat.c            | 10 +++----
 kernel/sched/core.c             | 13 +++------
 7 files changed, 75 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index 0bbdc720dd7c..779211fbb69f 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -886,15 +886,6 @@ All cgroup core files are prefixed with "cgroup."
 		A dying cgroup can consume system resources not exceeding
 		limits, which were active at the moment of cgroup deletion.
 
-	  cpu.usage_usec
-		CPU time consumed in the subtree.
-
-	  cpu.user_usec
-		User CPU time consumed in the subtree.
-
-	  cpu.system_usec
-		System CPU time consumed in the subtree.
-
 
 Controllers
 ===========
@@ -915,12 +906,16 @@ All time durations are in microseconds.
 
   cpu.stat
 	A read-only flat-keyed file which exists on non-root cgroups.
+	This file exists whether the controller is enabled or not.
 
-	It reports the following six stats:
+	It always reports the following three stats:
 
 	- usage_usec
 	- user_usec
 	- system_usec
+
+	and the following three when the controller is enabled:
+
 	- nr_periods
 	- nr_throttled
 	- throttled_usec
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 3e55bbd31ad1..ada6df7b1f55 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -569,6 +569,8 @@ struct cgroup_subsys {
 	void (*css_released)(struct cgroup_subsys_state *css);
 	void (*css_free)(struct cgroup_subsys_state *css);
 	void (*css_reset)(struct cgroup_subsys_state *css);
+	int (*css_extra_stat_show)(struct seq_file *seq,
+				   struct cgroup_subsys_state *css);
 
 	int (*can_attach)(struct cgroup_taskset *tset);
 	void (*cancel_attach)(struct cgroup_taskset *tset);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 328a70ce0e23..03cad08b09d1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -703,8 +703,6 @@ static inline void cpuacct_account_field(struct task_struct *tsk, int index,
 					 u64 val) {}
 #endif
 
-void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix);
-
 void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
 void __cgroup_account_cputime_field(struct cgroup *cgrp,
 				    enum cpu_usage_stat index, u64 delta_exec);
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index fa642c99586a..4dc317090920 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -205,6 +205,7 @@ int cgroup_task_count(const struct cgroup *cgrp);
 void cgroup_stat_flush(struct cgroup *cgrp);
 int cgroup_stat_init(struct cgroup *cgrp);
 void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
 void cgroup_stat_boot(void);
 
 /*
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 7975b20f1fd1..d9773e49a1b4 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -463,6 +463,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 		return &cgrp->self;
 }
 
+/**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css assocaited with @ss.  If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+						     struct cgroup_subsys *ss)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	css = cgroup_css(cgrp, ss);
+	if (!css || !css_tryget_online(css))
+		css = NULL;
+	rcu_read_unlock();
+
+	return css;
+}
+
 /**
  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -3311,11 +3333,40 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "nr_dying_descendants %d\n",
 		   cgroup->nr_dying_descendants);
 
-	cgroup_stat_show_cputime(seq, "cpu.");
-
 	return 0;
 }
 
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+						 struct cgroup *cgrp, int ssid)
+{
+	struct cgroup_subsys *ss = cgroup_subsys[ssid];
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	if (!ss->css_extra_stat_show)
+		return 0;
+
+	css = cgroup_tryget_css(cgrp, ss);
+	if (!css)
+		return 0;
+
+	ret = ss->css_extra_stat_show(seq, css);
+	css_put(css);
+	return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	int ret = 0;
+
+	cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+	return ret;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of->kn->priv;
@@ -4423,6 +4474,11 @@ static struct cftype cgroup_base_files[] = {
 		.name = "cgroup.stat",
 		.seq_show = cgroup_stat_show,
 	},
+	{
+		.name = "cpu.stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stat_show,
+	},
 	{ }	/* terminate */
 };
 
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
index 9cce79e89320..133b465691d6 100644
--- a/kernel/cgroup/stat.c
+++ b/kernel/cgroup/stat.c
@@ -256,7 +256,7 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
 	cgroup_cpu_stat_account_end(cgrp, cstat);
 }
 
-void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
+void cgroup_stat_show_cputime(struct seq_file *seq)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
 	u64 usage, utime, stime;
@@ -278,10 +278,10 @@ void cgroup_stat_show_cputime(struct seq_file *seq, const char *prefix)
 	do_div(utime, NSEC_PER_USEC);
 	do_div(stime, NSEC_PER_USEC);
 
-	seq_printf(seq, "%susage_usec %llu\n"
-		   "%suser_usec %llu\n"
-		   "%ssystem_usec %llu\n",
-		   prefix, usage, prefix, utime, prefix, stime);
+	seq_printf(seq, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n",
+		   usage, utime, stime);
 }
 
 int cgroup_stat_init(struct cgroup *cgrp)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad255162a830..0b3eec389552 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6678,13 +6678,12 @@ static struct cftype cpu_legacy_files[] = {
 	{ }	/* Terminate */
 };
 
-static int cpu_stat_show(struct seq_file *sf, void *v)
+static int cpu_extra_stat_show(struct seq_file *sf,
+			       struct cgroup_subsys_state *css)
 {
-	cgroup_stat_show_cputime(sf, "");
-
 #ifdef CONFIG_CFS_BANDWIDTH
 	{
-		struct task_group *tg = css_tg(seq_css(sf));
+		struct task_group *tg = css_tg(css);
 		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
 		u64 throttled_usec;
 
@@ -6817,11 +6816,6 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
 #endif
 
 static struct cftype cpu_files[] = {
-	{
-		.name = "stat",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cpu_stat_show,
-	},
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "weight",
@@ -6852,6 +6846,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_online	= cpu_cgroup_css_online,
 	.css_released	= cpu_cgroup_css_released,
 	.css_free	= cpu_cgroup_css_free,
+	.css_extra_stat_show = cpu_extra_stat_show,
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-- 
cgit v1.3-14-g43fede


From 7863406143d8bbbbda07a61285c5f4c217908dfd Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:28 +0200
Subject: sched/isolation: Move housekeeping related code to its own file

The housekeeping code is currently tied to the NOHZ code. As we are
planning to make housekeeping independent from it, start with moving
the relevant code to its own file.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-2-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/net/ethernet/tile/tilegx.c |  2 +-
 include/linux/sched/isolation.h    | 56 ++++++++++++++++++++++++++++++++++++++
 include/linux/tick.h               | 37 -------------------------
 init/main.c                        |  2 ++
 kernel/rcu/tree_plugin.h           |  1 +
 kernel/rcu/update.c                |  1 +
 kernel/sched/Makefile              |  1 +
 kernel/sched/core.c                |  1 +
 kernel/sched/fair.c                |  1 +
 kernel/sched/isolation.c           | 33 ++++++++++++++++++++++
 kernel/time/tick-sched.c           | 18 ------------
 kernel/watchdog.c                  |  1 +
 12 files changed, 98 insertions(+), 56 deletions(-)
 create mode 100644 include/linux/sched/isolation.h
 create mode 100644 kernel/sched/isolation.c

(limited to 'kernel')

diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index c00102b8145a..27a3272dcd48 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -40,7 +40,7 @@
 #include <linux/tcp.h>
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
-#include <linux/tick.h>
+#include <linux/sched/isolation.h>
 
 #include <asm/checksum.h>
 #include <asm/homecache.h>
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
new file mode 100644
index 000000000000..b7cfbc46286c
--- /dev/null
+++ b/include/linux/sched/isolation.h
@@ -0,0 +1,56 @@
+#ifndef _LINUX_SCHED_ISOLATION_H
+#define _LINUX_SCHED_ISOLATION_H
+
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/tick.h>
+
+#ifdef CONFIG_NO_HZ_FULL
+extern cpumask_var_t housekeeping_mask;
+
+static inline int housekeeping_any_cpu(void)
+{
+	return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+}
+
+extern void __init housekeeping_init(void);
+
+#else
+
+static inline int housekeeping_any_cpu(void)
+{
+	return smp_processor_id();
+}
+
+static inline void housekeeping_init(void) { }
+#endif /* CONFIG_NO_HZ_FULL */
+
+
+static inline const struct cpumask *housekeeping_cpumask(void)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		return housekeeping_mask;
+#endif
+	return cpu_possible_mask;
+}
+
+static inline bool is_housekeeping_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		return cpumask_test_cpu(cpu, housekeeping_mask);
+#endif
+	return true;
+}
+
+static inline void housekeeping_affine(struct task_struct *t)
+{
+#ifdef CONFIG_NO_HZ_FULL
+	if (tick_nohz_full_enabled())
+		set_cpus_allowed_ptr(t, housekeeping_mask);
+
+#endif
+}
+
+#endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index fe01e68bf520..68afc09aa8ac 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -137,7 +137,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
 #ifdef CONFIG_NO_HZ_FULL
 extern bool tick_nohz_full_running;
 extern cpumask_var_t tick_nohz_full_mask;
-extern cpumask_var_t housekeeping_mask;
 
 static inline bool tick_nohz_full_enabled(void)
 {
@@ -161,11 +160,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
 		cpumask_or(mask, mask, tick_nohz_full_mask);
 }
 
-static inline int housekeeping_any_cpu(void)
-{
-	return cpumask_any_and(housekeeping_mask, cpu_online_mask);
-}
-
 extern void tick_nohz_dep_set(enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
 extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
@@ -235,10 +229,6 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void __tick_nohz_task_switch(void);
 #else
-static inline int housekeeping_any_cpu(void)
-{
-	return smp_processor_id();
-}
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
 static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
@@ -260,33 +250,6 @@ static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void __tick_nohz_task_switch(void) { }
 #endif
 
-static inline const struct cpumask *housekeeping_cpumask(void)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		return housekeeping_mask;
-#endif
-	return cpu_possible_mask;
-}
-
-static inline bool is_housekeeping_cpu(int cpu)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		return cpumask_test_cpu(cpu, housekeeping_mask);
-#endif
-	return true;
-}
-
-static inline void housekeeping_affine(struct task_struct *t)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		set_cpus_allowed_ptr(t, housekeeping_mask);
-
-#endif
-}
-
 static inline void tick_nohz_task_switch(void)
 {
 	if (tick_nohz_full_enabled())
diff --git a/init/main.c b/init/main.c
index 0ee9c6866ada..4610c99ae306 100644
--- a/init/main.c
+++ b/init/main.c
@@ -46,6 +46,7 @@
 #include <linux/cgroup.h>
 #include <linux/efi.h>
 #include <linux/tick.h>
+#include <linux/sched/isolation.h>
 #include <linux/interrupt.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
@@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
 	early_irq_init();
 	init_IRQ();
 	tick_init();
+	housekeeping_init();
 	rcu_init_nohz();
 	init_timers();
 	hrtimers_init();
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..c7632f51c5a0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -29,6 +29,7 @@
 #include <linux/oom.h>
 #include <linux/sched/debug.h>
 #include <linux/smpboot.h>
+#include <linux/sched/isolation.h>
 #include <uapi/linux/sched/types.h>
 #include "../time/tick-internal.h"
 
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5033b66d2753..79abeb0ca329 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -51,6 +51,7 @@
 #include <linux/kthread.h>
 #include <linux/tick.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/sched/isolation.h>
 
 #define CREATE_TRACE_POINTS
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 78f54932ea1d..871d43d73375 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
+obj-$(CONFIG_NO_HZ_FULL) += isolation.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2288a145bf01..ad188acb7636 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,6 +26,7 @@
 #include <linux/profile.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/sched/isolation.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56f343b8e749..591481db8c6a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,6 +32,7 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/sched/isolation.h>
 
 #include <trace/events/sched.h>
 
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
new file mode 100644
index 000000000000..3589252ed476
--- /dev/null
+++ b/kernel/sched/isolation.c
@@ -0,0 +1,33 @@
+/*
+ *  Housekeeping management. Manage the targets for routine code that can run on
+ *  any CPU: unbound workqueues, timers, kthreads and any offloadable work.
+ *
+ * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ *
+ */
+
+#include <linux/sched/isolation.h>
+#include <linux/tick.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+
+cpumask_var_t housekeeping_mask;
+
+void __init housekeeping_init(void)
+{
+	if (!tick_nohz_full_enabled())
+		return;
+
+	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+		WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
+		cpumask_clear(tick_nohz_full_mask);
+		tick_nohz_full_running = false;
+		return;
+	}
+
+	cpumask_andnot(housekeeping_mask,
+		       cpu_possible_mask, tick_nohz_full_mask);
+
+	/* We need at least one CPU to handle housekeeping work */
+	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7b258c59d78a..27d7d522ac4e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -166,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
 
 #ifdef CONFIG_NO_HZ_FULL
 cpumask_var_t tick_nohz_full_mask;
-cpumask_var_t housekeeping_mask;
 bool tick_nohz_full_running;
 static atomic_t tick_dep_mask;
 
@@ -438,13 +437,6 @@ void __init tick_nohz_init(void)
 			return;
 	}
 
-	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
-		WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
-		cpumask_clear(tick_nohz_full_mask);
-		tick_nohz_full_running = false;
-		return;
-	}
-
 	/*
 	 * Full dynticks uses irq work to drive the tick rescheduling on safe
 	 * locking contexts. But then we need irq work to raise its own
@@ -453,7 +445,6 @@ void __init tick_nohz_init(void)
 	if (!arch_irq_work_has_interrupt()) {
 		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
 		cpumask_clear(tick_nohz_full_mask);
-		cpumask_copy(housekeeping_mask, cpu_possible_mask);
 		tick_nohz_full_running = false;
 		return;
 	}
@@ -466,9 +457,6 @@ void __init tick_nohz_init(void)
 		cpumask_clear_cpu(cpu, tick_nohz_full_mask);
 	}
 
-	cpumask_andnot(housekeeping_mask,
-		       cpu_possible_mask, tick_nohz_full_mask);
-
 	for_each_cpu(cpu, tick_nohz_full_mask)
 		context_tracking_cpu_set(cpu);
 
@@ -478,12 +466,6 @@ void __init tick_nohz_init(void)
 	WARN_ON(ret < 0);
 	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
 		cpumask_pr_args(tick_nohz_full_mask));
-
-	/*
-	 * We need at least one CPU to handle housekeeping work such
-	 * as timekeeping, unbound timers, workqueues, ...
-	 */
-	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
 #endif
 
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6bcb854909c0..3c44dbaa4b9f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
-- 
cgit v1.3-14-g43fede


From 13316b31fdaaa45f06793eb7992588359ba6ab9f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:29 +0200
Subject: sched/isolation, watchdog: Use housekeeping_cpumask() instead of
 ad-hoc version

While trying to disable the watchog on nohz_full CPUs, the watchdog
implements an ad-hoc version of housekeeping_cpumask(). Lets replace
those re-invented lines.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-3-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/watchdog.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 3c44dbaa4b9f..562652c9c815 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -774,15 +774,10 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
 
 void __init lockup_detector_init(void)
 {
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled()) {
+	if (tick_nohz_full_enabled())
 		pr_info("Disabling watchdog on nohz_full cores by default\n");
-		cpumask_copy(&watchdog_cpumask, housekeeping_mask);
-	} else
-		cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
-#else
-	cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
-#endif
+
+	cpumask_copy(&watchdog_cpumask, housekeeping_cpumask());
 
 	if (!watchdog_nmi_probe())
 		nmi_watchdog_available = true;
-- 
cgit v1.3-14-g43fede


From 7e56a1cf4b28f5739526877b8dbad623fae2e4e7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:31 +0200
Subject: sched/isolation: Make the housekeeping cpumask private

Nobody needs to access this detail. housekeeping_cpumask() already
takes care of it.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-5-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 31 ++++++++++---------------------
 kernel/sched/isolation.c        | 36 +++++++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 040df04fa78a..ed935ffc6ffa 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -6,46 +6,35 @@
 #include <linux/tick.h>
 
 #ifdef CONFIG_NO_HZ_FULL
-extern cpumask_var_t housekeeping_mask;
+extern int housekeeping_any_cpu(void);
+extern const struct cpumask *housekeeping_cpumask(void);
+extern void housekeeping_affine(struct task_struct *t);
+extern bool housekeeping_test_cpu(int cpu);
 extern void __init housekeeping_init(void);
+
 #else
-static inline void housekeeping_init(void) { }
-#endif /* CONFIG_NO_HZ_FULL */
 
 static inline int housekeeping_any_cpu(void)
 {
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
-#endif
 	return smp_processor_id();
 }
 
 static inline const struct cpumask *housekeeping_cpumask(void)
 {
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		return housekeeping_mask;
-#endif
 	return cpu_possible_mask;
 }
 
+static inline void housekeeping_affine(struct task_struct *t) { }
+static inline void housekeeping_init(void) { }
+#endif /* CONFIG_NO_HZ_FULL */
+
 static inline bool is_housekeeping_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_enabled())
-		return cpumask_test_cpu(cpu, housekeeping_mask);
+		return housekeeping_test_cpu(cpu);
 #endif
 	return true;
 }
 
-static inline void housekeeping_affine(struct task_struct *t)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
-		set_cpus_allowed_ptr(t, housekeeping_mask);
-
-#endif
-}
-
 #endif /* _LINUX_SCHED_ISOLATION_H */
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3589252ed476..16445097eb25 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -11,7 +11,41 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 
-cpumask_var_t housekeeping_mask;
+static cpumask_var_t housekeeping_mask;
+
+int housekeeping_any_cpu(void)
+{
+	if (tick_nohz_full_enabled())
+		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+
+	return smp_processor_id();
+}
+EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
+
+const struct cpumask *housekeeping_cpumask(void)
+{
+	if (tick_nohz_full_enabled())
+		return housekeeping_mask;
+
+	return cpu_possible_mask;
+}
+EXPORT_SYMBOL_GPL(housekeeping_cpumask);
+
+void housekeeping_affine(struct task_struct *t)
+{
+	if (tick_nohz_full_enabled())
+		set_cpus_allowed_ptr(t, housekeeping_mask);
+}
+EXPORT_SYMBOL_GPL(housekeeping_affine);
+
+bool housekeeping_test_cpu(int cpu)
+{
+	if (tick_nohz_full_enabled())
+		return cpumask_test_cpu(cpu, housekeeping_mask);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
 
 void __init housekeeping_init(void)
 {
-- 
cgit v1.3-14-g43fede


From e179f5a04ba46ee5c5439480c2bfd68c358168b7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:32 +0200
Subject: sched/isolation: Use its own static key

Housekeeping code still depends on the nohz_full static key. Since we want
to decouple housekeeping from NOHZ, let's create a housekeeping specific
static key.

It's mostly relevant for calls to is_housekeeping_cpu() from the scheduler.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-6-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h |  3 ++-
 kernel/sched/isolation.c        | 13 +++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index ed935ffc6ffa..194c586fbb12 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -6,6 +6,7 @@
 #include <linux/tick.h>
 
 #ifdef CONFIG_NO_HZ_FULL
+DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
 extern int housekeeping_any_cpu(void);
 extern const struct cpumask *housekeeping_cpumask(void);
 extern void housekeeping_affine(struct task_struct *t);
@@ -31,7 +32,7 @@ static inline void housekeeping_init(void) { }
 static inline bool is_housekeeping_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		return housekeeping_test_cpu(cpu);
 #endif
 	return true;
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 16445097eb25..bb8ba19a0235 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -10,12 +10,15 @@
 #include <linux/tick.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/static_key.h>
 
+DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
+EXPORT_SYMBOL_GPL(housekeeping_overriden);
 static cpumask_var_t housekeeping_mask;
 
 int housekeeping_any_cpu(void)
 {
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
 
 	return smp_processor_id();
@@ -24,7 +27,7 @@ EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
 
 const struct cpumask *housekeeping_cpumask(void)
 {
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		return housekeeping_mask;
 
 	return cpu_possible_mask;
@@ -33,14 +36,14 @@ EXPORT_SYMBOL_GPL(housekeeping_cpumask);
 
 void housekeeping_affine(struct task_struct *t)
 {
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		set_cpus_allowed_ptr(t, housekeeping_mask);
 }
 EXPORT_SYMBOL_GPL(housekeeping_affine);
 
 bool housekeeping_test_cpu(int cpu)
 {
-	if (tick_nohz_full_enabled())
+	if (static_branch_unlikely(&housekeeping_overriden))
 		return cpumask_test_cpu(cpu, housekeeping_mask);
 
 	return true;
@@ -62,6 +65,8 @@ void __init housekeeping_init(void)
 	cpumask_andnot(housekeeping_mask,
 		       cpu_possible_mask, tick_nohz_full_mask);
 
+	static_branch_enable(&housekeeping_overriden);
+
 	/* We need at least one CPU to handle housekeeping work */
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
-- 
cgit v1.3-14-g43fede


From 204c083a009378dfa751175b5fcddc75988bab6c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:33 +0200
Subject: sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu()

Fit it into the housekeeping_*() namespace.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-7-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 2 +-
 kernel/sched/core.c             | 6 +++---
 kernel/sched/fair.c             | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 194c586fbb12..ad0f5d986a2e 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -29,7 +29,7 @@ static inline void housekeeping_affine(struct task_struct *t) { }
 static inline void housekeeping_init(void) { }
 #endif /* CONFIG_NO_HZ_FULL */
 
-static inline bool is_housekeeping_cpu(int cpu)
+static inline bool housekeeping_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
 	if (static_branch_unlikely(&housekeeping_overriden))
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad188acb7636..d0fb448dd43a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -527,7 +527,7 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+	if (!idle_cpu(cpu) && housekeeping_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
@@ -536,14 +536,14 @@ int get_nohz_timer_target(void)
 			if (cpu == i)
 				continue;
 
-			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+			if (!idle_cpu(i) && housekeeping_cpu(i)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
 
-	if (!is_housekeeping_cpu(cpu))
+	if (!housekeeping_cpu(cpu))
 		cpu = housekeeping_any_cpu();
 unlock:
 	rcu_read_unlock();
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 591481db8c6a..cdece8f967f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu)
 		return;
 
 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
-	if (!is_housekeeping_cpu(cpu))
+	if (!housekeeping_cpu(cpu))
 		return;
 
 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
-- 
cgit v1.3-14-g43fede


From 5c4991e24c69737bd41fc2737b1e3980abbf73f9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:34 +0200
Subject: sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from
 CONFIG_NO_HZ_FULL

Split the housekeeping config from CONFIG_NO_HZ_FULL. This way we finally
separate the isolation code from NOHZ.

Although a dependency to CONFIG_NO_HZ_FULL remains for now, while the
housekeeping code still deals with NOHZ internals.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-8-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h | 6 +++---
 init/Kconfig                    | 8 ++++++++
 kernel/sched/Makefile           | 2 +-
 3 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index ad0f5d986a2e..93ac2367a520 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -5,7 +5,7 @@
 #include <linux/init.h>
 #include <linux/tick.h>
 
-#ifdef CONFIG_NO_HZ_FULL
+#ifdef CONFIG_CPU_ISOLATION
 DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
 extern int housekeeping_any_cpu(void);
 extern const struct cpumask *housekeeping_cpumask(void);
@@ -27,11 +27,11 @@ static inline const struct cpumask *housekeeping_cpumask(void)
 
 static inline void housekeeping_affine(struct task_struct *t) { }
 static inline void housekeeping_init(void) { }
-#endif /* CONFIG_NO_HZ_FULL */
+#endif /* CONFIG_CPU_ISOLATION */
 
 static inline bool housekeeping_cpu(int cpu)
 {
-#ifdef CONFIG_NO_HZ_FULL
+#ifdef CONFIG_CPU_ISOLATION
 	if (static_branch_unlikely(&housekeeping_overriden))
 		return housekeeping_test_cpu(cpu);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index 78cb2461012e..6f52e6f4bd9d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -472,6 +472,14 @@ config TASK_IO_ACCOUNTING
 
 endmenu # "CPU/Task time and stats accounting"
 
+config CPU_ISOLATION
+	bool "CPU isolation"
+	depends on NO_HZ_FULL
+	help
+	  Make sure that CPUs running critical tasks are not disturbed by
+	  any source of "noise" such as unbound workqueues, timers, kthreads...
+	  Unbound jobs get offloaded to housekeeping CPUs.
+
 source "kernel/rcu/Kconfig"
 
 config BUILD_BIN2C
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 871d43d73375..dbe61302c352 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -26,4 +26,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
-obj-$(CONFIG_NO_HZ_FULL) += isolation.o
+obj-$(CONFIG_CPU_ISOLATION) += isolation.o
-- 
cgit v1.3-14-g43fede


From de201559df872f83d0c08fb4effe3efd28e6cbc8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:35 +0200
Subject: sched/isolation: Introduce housekeeping flags

Before we implement isolcpus under housekeeping, we need the isolation
features to be more finegrained. For example some people want NOHZ_FULL
without the full scheduler isolation, others want full scheduler
isolation without NOHZ_FULL.

So let's cut all these isolation features piecewise, at the risk of
overcutting it right now. We can still merge some flags later if they
always make sense together.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-9-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/net/ethernet/tile/tilegx.c |  4 ++--
 include/linux/sched/isolation.h    | 26 +++++++++++++++++---------
 kernel/rcu/tree_plugin.h           |  2 +-
 kernel/rcu/update.c                |  2 +-
 kernel/sched/core.c                |  8 ++++----
 kernel/sched/fair.c                |  2 +-
 kernel/sched/isolation.c           | 26 +++++++++++++++-----------
 kernel/watchdog.c                  |  3 ++-
 8 files changed, 43 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
index 27a3272dcd48..b3e5816a4678 100644
--- a/drivers/net/ethernet/tile/tilegx.c
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void)
 		tile_net_dev_init(name, mac);
 
 	if (!network_cpus_init())
-		cpumask_and(&network_cpus_map, housekeeping_cpumask(),
-			    cpu_online_mask);
+		cpumask_and(&network_cpus_map,
+			    housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask);
 
 	return 0;
 }
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 93ac2367a520..9bb753eece3b 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -5,35 +5,43 @@
 #include <linux/init.h>
 #include <linux/tick.h>
 
+enum hk_flags {
+	HK_FLAG_TIMER		= 1,
+	HK_FLAG_RCU		= (1 << 1),
+	HK_FLAG_MISC		= (1 << 2),
+	HK_FLAG_SCHED		= (1 << 3),
+};
+
 #ifdef CONFIG_CPU_ISOLATION
 DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
-extern int housekeeping_any_cpu(void);
-extern const struct cpumask *housekeeping_cpumask(void);
-extern void housekeeping_affine(struct task_struct *t);
-extern bool housekeeping_test_cpu(int cpu);
+extern int housekeeping_any_cpu(enum hk_flags flags);
+extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
+extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
+extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
 extern void __init housekeeping_init(void);
 
 #else
 
-static inline int housekeeping_any_cpu(void)
+static inline int housekeeping_any_cpu(enum hk_flags flags)
 {
 	return smp_processor_id();
 }
 
-static inline const struct cpumask *housekeeping_cpumask(void)
+static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
 {
 	return cpu_possible_mask;
 }
 
-static inline void housekeeping_affine(struct task_struct *t) { }
+static inline void housekeeping_affine(struct task_struct *t,
+				       enum hk_flags flags) { }
 static inline void housekeeping_init(void) { }
 #endif /* CONFIG_CPU_ISOLATION */
 
-static inline bool housekeeping_cpu(int cpu)
+static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
 {
 #ifdef CONFIG_CPU_ISOLATION
 	if (static_branch_unlikely(&housekeeping_overriden))
-		return housekeeping_test_cpu(cpu);
+		return housekeeping_test_cpu(cpu, flags);
 #endif
 	return true;
 }
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c7632f51c5a0..34125d23e58a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2584,7 +2584,7 @@ static void rcu_bind_gp_kthread(void)
 
 	if (!tick_nohz_full_enabled())
 		return;
-	housekeeping_affine(current);
+	housekeeping_affine(current, HK_FLAG_RCU);
 }
 
 /* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 79abeb0ca329..e3e60efaafee 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -719,7 +719,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 	LIST_HEAD(rcu_tasks_holdouts);
 
 	/* Run on housekeeping CPUs by default.  Sysadm can move if desired. */
-	housekeeping_affine(current);
+	housekeeping_affine(current, HK_FLAG_RCU);
 
 	/*
 	 * Each pass through the following loop makes one check for
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d0fb448dd43a..2210c0203e51 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -527,7 +527,7 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (!idle_cpu(cpu) && housekeeping_cpu(cpu))
+	if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
 		return cpu;
 
 	rcu_read_lock();
@@ -536,15 +536,15 @@ int get_nohz_timer_target(void)
 			if (cpu == i)
 				continue;
 
-			if (!idle_cpu(i) && housekeeping_cpu(i)) {
+			if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
 
-	if (!housekeeping_cpu(cpu))
-		cpu = housekeeping_any_cpu();
+	if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+		cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
 unlock:
 	rcu_read_unlock();
 	return cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cdece8f967f0..f755de86a813 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9027,7 +9027,7 @@ void nohz_balance_enter_idle(int cpu)
 		return;
 
 	/* Spare idle load balancing on CPUs that don't want to be disturbed: */
-	if (!housekeeping_cpu(cpu))
+	if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
 		return;
 
 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index bb8ba19a0235..37a138a780ff 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -15,37 +15,39 @@
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
 static cpumask_var_t housekeeping_mask;
+static unsigned int housekeeping_flags;
 
-int housekeeping_any_cpu(void)
+int housekeeping_any_cpu(enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overriden))
-		return cpumask_any_and(housekeeping_mask, cpu_online_mask);
-
+		if (housekeeping_flags & flags)
+			return cpumask_any_and(housekeeping_mask, cpu_online_mask);
 	return smp_processor_id();
 }
 EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
 
-const struct cpumask *housekeeping_cpumask(void)
+const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overriden))
-		return housekeeping_mask;
-
+		if (housekeeping_flags & flags)
+			return housekeeping_mask;
 	return cpu_possible_mask;
 }
 EXPORT_SYMBOL_GPL(housekeeping_cpumask);
 
-void housekeeping_affine(struct task_struct *t)
+void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overriden))
-		set_cpus_allowed_ptr(t, housekeeping_mask);
+		if (housekeeping_flags & flags)
+			set_cpus_allowed_ptr(t, housekeeping_mask);
 }
 EXPORT_SYMBOL_GPL(housekeeping_affine);
 
-bool housekeeping_test_cpu(int cpu)
+bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overriden))
-		return cpumask_test_cpu(cpu, housekeeping_mask);
-
+		if (housekeeping_flags & flags)
+			return cpumask_test_cpu(cpu, housekeeping_mask);
 	return true;
 }
 EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
@@ -65,6 +67,8 @@ void __init housekeeping_init(void)
 	cpumask_andnot(housekeeping_mask,
 		       cpu_possible_mask, tick_nohz_full_mask);
 
+	housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+
 	static_branch_enable(&housekeeping_overriden);
 
 	/* We need at least one CPU to handle housekeeping work */
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 562652c9c815..e9e2ebb3db29 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -777,7 +777,8 @@ void __init lockup_detector_init(void)
 	if (tick_nohz_full_enabled())
 		pr_info("Disabling watchdog on nohz_full cores by default\n");
 
-	cpumask_copy(&watchdog_cpumask, housekeeping_cpumask());
+	cpumask_copy(&watchdog_cpumask,
+		     housekeeping_cpumask(HK_FLAG_TIMER));
 
 	if (!watchdog_nmi_probe())
 		nmi_watchdog_available = true;
-- 
cgit v1.3-14-g43fede


From 6f1982fedd59856bcc42a9b521be4c3ffd2f60a7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:36 +0200
Subject: sched/isolation: Handle the nohz_full= parameter

We want to centralize the isolation management, done by the housekeeping
subsystem. Therefore we need to handle the nohz_full= parameter from
there.

Since nohz_full= so far has involved unbound timers, watchdog, RCU
and tilegx NAPI isolation, we keep that default behaviour.

nohz_full= will be deprecated in the future. We want to control
the isolation features from the isolcpus= parameter.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-10-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/isolation.h |  1 +
 include/linux/tick.h            |  2 ++
 init/Kconfig                    |  1 -
 kernel/sched/isolation.c        | 42 +++++++++++++++++++++++++++++------------
 kernel/time/tick-sched.c        | 13 +++----------
 5 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 9bb753eece3b..e53cfa96e91e 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -10,6 +10,7 @@ enum hk_flags {
 	HK_FLAG_RCU		= (1 << 1),
 	HK_FLAG_MISC		= (1 << 2),
 	HK_FLAG_SCHED		= (1 << 3),
+	HK_FLAG_TICK		= (1 << 4),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 68afc09aa8ac..e2a163a9f96c 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -228,6 +228,7 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
 
 extern void tick_nohz_full_kick_cpu(int cpu);
 extern void __tick_nohz_task_switch(void);
+extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
 #else
 static inline bool tick_nohz_full_enabled(void) { return false; }
 static inline bool tick_nohz_full_cpu(int cpu) { return false; }
@@ -248,6 +249,7 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
 
 static inline void tick_nohz_full_kick_cpu(int cpu) { }
 static inline void __tick_nohz_task_switch(void) { }
+static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
 #endif
 
 static inline void tick_nohz_task_switch(void)
diff --git a/init/Kconfig b/init/Kconfig
index 6f52e6f4bd9d..f8564df58d0a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -474,7 +474,6 @@ endmenu # "CPU/Task time and stats accounting"
 
 config CPU_ISOLATION
 	bool "CPU isolation"
-	depends on NO_HZ_FULL
 	help
 	  Make sure that CPUs running critical tasks are not disturbed by
 	  any source of "noise" such as unbound workqueues, timers, kthreads...
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 37a138a780ff..1f61e440358d 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -54,23 +54,41 @@ EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
 
 void __init housekeeping_init(void)
 {
-	if (!tick_nohz_full_enabled())
+	if (!housekeeping_flags)
 		return;
 
-	if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
-		WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
-		cpumask_clear(tick_nohz_full_mask);
-		tick_nohz_full_running = false;
-		return;
+	static_branch_enable(&housekeeping_overriden);
+
+	/* We need at least one CPU to handle housekeeping work */
+	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+	cpumask_var_t non_housekeeping_mask;
+
+	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
+	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
+		pr_warn("Housekeeping: Incorrect nohz_full cpumask\n");
+		free_bootmem_cpumask_var(non_housekeeping_mask);
+		return 0;
 	}
 
-	cpumask_andnot(housekeeping_mask,
-		       cpu_possible_mask, tick_nohz_full_mask);
+	alloc_bootmem_cpumask_var(&housekeeping_mask);
+	cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask);
 
-	housekeeping_flags = HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+	if (cpumask_empty(housekeeping_mask))
+		cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
 
-	static_branch_enable(&housekeeping_overriden);
+	housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER |
+				HK_FLAG_RCU | HK_FLAG_MISC;
 
-	/* We need at least one CPU to handle housekeeping work */
-	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+	tick_nohz_full_setup(non_housekeeping_mask);
+
+	free_bootmem_cpumask_var(non_housekeeping_mask);
+
+	return 1;
 }
+__setup("nohz_full=", housekeeping_nohz_full_setup);
+#endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 27d7d522ac4e..69f3dbe38984 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -385,20 +385,13 @@ out:
 	local_irq_restore(flags);
 }
 
-/* Parse the boot-time nohz CPU list from the kernel parameters. */
-static int __init tick_nohz_full_setup(char *str)
+/* Get the boot-time nohz CPU list from the kernel parameters. */
+void __init tick_nohz_full_setup(cpumask_var_t cpumask)
 {
 	alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
-	if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
-		pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
-		free_bootmem_cpumask_var(tick_nohz_full_mask);
-		return 1;
-	}
+	cpumask_copy(tick_nohz_full_mask, cpumask);
 	tick_nohz_full_running = true;
-
-	return 1;
 }
-__setup("nohz_full=", tick_nohz_full_setup);
 
 static int tick_nohz_cpu_down(unsigned int cpu)
 {
-- 
cgit v1.3-14-g43fede


From edb9382175c3ebdced8ffdb3e0f20052ad9fdbe9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:37 +0200
Subject: sched/isolation: Move isolcpus= handling to the housekeeping code

We want to centralize the isolation features, to be done by the housekeeping
subsystem and scheduler domain isolation is a significant part of it.

No intended behaviour change, we just reuse the housekeeping cpumask
and core code.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-11-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 drivers/base/cpu.c              | 11 ++++++-
 include/linux/sched.h           |  2 --
 include/linux/sched/isolation.h |  1 +
 kernel/cgroup/cpuset.c          | 15 ++++------
 kernel/sched/core.c             | 16 +----------
 kernel/sched/isolation.c        | 63 ++++++++++++++++++++++++++++++++---------
 kernel/sched/topology.c         | 24 ++++------------
 7 files changed, 73 insertions(+), 59 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 321cd7b4d817..a73ab95558f5 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -18,6 +18,7 @@
 #include <linux/cpufeature.h>
 #include <linux/tick.h>
 #include <linux/pm_qos.h>
+#include <linux/sched/isolation.h>
 
 #include "base.h"
 
@@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev,
 				  struct device_attribute *attr, char *buf)
 {
 	int n = 0, len = PAGE_SIZE-2;
+	cpumask_var_t isolated;
 
-	n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map));
+	if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_andnot(isolated, cpu_possible_mask,
+		       housekeeping_cpumask(HK_FLAG_DOMAIN));
+	n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
+
+	free_cpumask_var(isolated);
 
 	return n;
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0f897dfc195e..1b0cc0d6df8d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -165,8 +165,6 @@ struct task_group;
 /* Task command name length: */
 #define TASK_COMM_LEN			16
 
-extern cpumask_var_t			cpu_isolated_map;
-
 extern void scheduler_tick(void);
 
 #define	MAX_SCHEDULE_TIMEOUT		LONG_MAX
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index e53cfa96e91e..d849431c8060 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -11,6 +11,7 @@ enum hk_flags {
 	HK_FLAG_MISC		= (1 << 2),
 	HK_FLAG_SCHED		= (1 << 3),
 	HK_FLAG_TICK		= (1 << 4),
+	HK_FLAG_DOMAIN		= (1 << 5),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e2924ecb..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -57,7 +57,7 @@
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 #include <linux/oom.h>
-
+#include <linux/sched/isolation.h>
 #include <linux/uaccess.h>
 #include <linux/atomic.h>
 #include <linux/mutex.h>
@@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int csn;		/* how many cpuset ptrs in csa so far */
 	int i, j, k;		/* indices for partition finding loops */
 	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
-	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
 	struct sched_domain_attr *dattr;  /* attributes for custom domains */
 	int ndoms = 0;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] struct cpumask slot */
@@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	dattr = NULL;
 	csa = NULL;
 
-	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
-		goto done;
-	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
 	/* Special case for the 99% of systems with one, full, sched domain */
 	if (is_sched_load_balance(&top_cpuset)) {
 		ndoms = 1;
@@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		cpumask_and(doms[0], top_cpuset.effective_cpus,
-				     non_isolated_cpus);
+			    housekeeping_cpumask(HK_FLAG_DOMAIN));
 
 		goto done;
 	}
@@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
-		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+		      cpumask_intersects(cp->cpus_allowed,
+					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
 			continue;
 
 		if (is_sched_load_balance(cp))
@@ -789,7 +785,7 @@ restart:
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
-				cpumask_and(dp, dp, non_isolated_cpus);
+				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -802,7 +798,6 @@ restart:
 	BUG_ON(nslot != ndoms);
 
 done:
-	free_cpumask_var(non_isolated_cpus);
 	kfree(csa);
 
 	/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2210c0203e51..1a55c842bfbc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -84,9 +84,6 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -5735,10 +5732,6 @@ static inline void sched_init_smt(void) { }
 
 void __init sched_init_smp(void)
 {
-	cpumask_var_t non_isolated_cpus;
-
-	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
 	sched_init_numa();
 
 	/*
@@ -5748,16 +5741,12 @@ void __init sched_init_smp(void)
 	 */
 	mutex_lock(&sched_domains_mutex);
 	sched_init_domains(cpu_active_mask);
-	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-	if (cpumask_empty(non_isolated_cpus))
-		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
 	mutex_unlock(&sched_domains_mutex);
 
 	/* Move init over to a non-isolated CPU */
-	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+	if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
 		BUG();
 	sched_init_granularity();
-	free_cpumask_var(non_isolated_cpus);
 
 	init_sched_rt_class();
 	init_sched_dl_class();
@@ -5961,9 +5950,6 @@ void __init sched_init(void)
 	calc_load_update = jiffies + LOAD_FREQ;
 
 #ifdef CONFIG_SMP
-	/* May be allocated at isolcpus cmdline parse time */
-	if (cpu_isolated_map == NULL)
-		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
 	set_cpu_rq_start_time(smp_processor_id());
 #endif
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 1f61e440358d..8f666bc5abe8 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -63,32 +63,69 @@ void __init housekeeping_init(void)
 	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
 
-#ifdef CONFIG_NO_HZ_FULL
-static int __init housekeeping_nohz_full_setup(char *str)
+static int __init housekeeping_setup(char *str, enum hk_flags flags)
 {
 	cpumask_var_t non_housekeeping_mask;
+	int err;
 
 	alloc_bootmem_cpumask_var(&non_housekeeping_mask);
-	if (cpulist_parse(str, non_housekeeping_mask) < 0) {
-		pr_warn("Housekeeping: Incorrect nohz_full cpumask\n");
+	err = cpulist_parse(str, non_housekeeping_mask);
+	if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
+		pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
 		free_bootmem_cpumask_var(non_housekeeping_mask);
 		return 0;
 	}
 
-	alloc_bootmem_cpumask_var(&housekeeping_mask);
-	cpumask_andnot(housekeeping_mask, cpu_possible_mask, non_housekeeping_mask);
-
-	if (cpumask_empty(housekeeping_mask))
-		cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+	if (!housekeeping_flags) {
+		alloc_bootmem_cpumask_var(&housekeeping_mask);
+		cpumask_andnot(housekeeping_mask,
+			       cpu_possible_mask, non_housekeeping_mask);
+		if (cpumask_empty(housekeeping_mask))
+			cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+	} else {
+		cpumask_var_t tmp;
+
+		alloc_bootmem_cpumask_var(&tmp);
+		cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
+		if (!cpumask_equal(tmp, housekeeping_mask)) {
+			pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+			free_bootmem_cpumask_var(tmp);
+			free_bootmem_cpumask_var(non_housekeeping_mask);
+			return 0;
+		}
+		free_bootmem_cpumask_var(tmp);
+	}
 
-	housekeeping_flags = HK_FLAG_TICK | HK_FLAG_TIMER |
-				HK_FLAG_RCU | HK_FLAG_MISC;
+	if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
+		if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+			tick_nohz_full_setup(non_housekeeping_mask);
+		} else {
+			pr_warn("Housekeeping: nohz unsupported."
+				" Build with CONFIG_NO_HZ_FULL\n");
+			free_bootmem_cpumask_var(non_housekeeping_mask);
+			return 0;
+		}
+	}
 
-	tick_nohz_full_setup(non_housekeeping_mask);
+	housekeeping_flags |= flags;
 
 	free_bootmem_cpumask_var(non_housekeeping_mask);
 
 	return 1;
 }
+
+static int __init housekeeping_nohz_full_setup(char *str)
+{
+	unsigned int flags;
+
+	flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+
+	return housekeeping_setup(str, flags);
+}
 __setup("nohz_full=", housekeeping_nohz_full_setup);
-#endif
+
+static int __init housekeeping_isolcpus_setup(char *str)
+{
+	return housekeeping_setup(str, HK_FLAG_DOMAIN);
+}
+__setup("isolcpus=", housekeeping_isolcpus_setup);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e3d31b0880dc..2e6b9126ffdc 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sched.h>
 #include <linux/mutex.h>
+#include <linux/sched/isolation.h>
 
 #include "sched.h"
 
@@ -469,21 +470,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	update_top_cache_domain(cpu);
 }
 
-/* Setup the mask of CPUs configured for isolated domains */
-static int __init isolated_cpu_setup(char *str)
-{
-	int ret;
-
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
-	ret = cpulist_parse(str, cpu_isolated_map);
-	if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) {
-		pr_err("sched: Error, all isolcpus= values must be between 0 and %u - ignoring them.\n", nr_cpu_ids-1);
-		return 0;
-	}
-	return 1;
-}
-__setup("isolcpus=", isolated_cpu_setup);
-
 struct s_data {
 	struct sched_domain ** __percpu sd;
 	struct root_domain	*rd;
@@ -1792,7 +1778,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
 	doms_cur = alloc_sched_domains(ndoms_cur);
 	if (!doms_cur)
 		doms_cur = &fallback_doms;
-	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
 	err = build_sched_domains(doms_cur[0], NULL);
 	register_sched_domain_sysctl();
 
@@ -1875,7 +1861,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
 		doms_new = alloc_sched_domains(1);
 		if (doms_new) {
 			n = 1;
-			cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+			cpumask_and(doms_new[0], cpu_active_mask,
+				    housekeeping_cpumask(HK_FLAG_DOMAIN));
 		}
 	} else {
 		n = ndoms_new;
@@ -1898,7 +1885,8 @@ match1:
 	if (!doms_new) {
 		n = 0;
 		doms_new = &fallback_doms;
-		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+		cpumask_and(doms_new[0], cpu_active_mask,
+			    housekeeping_cpumask(HK_FLAG_DOMAIN));
 	}
 
 	/* Build new domains: */
-- 
cgit v1.3-14-g43fede


From 150dfee95feb913876ced5cc559a342384e8ea97 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 27 Oct 2017 04:42:38 +0200
Subject: sched/isolation: Add basic isolcpus flags

Add flags to control NOHZ and domain isolation from "isolcpus=", in
order to centralize the isolation features to a common interface. Domain
isolation remains the default so not to break the existing isolcpus
boot paramater behaviour.

Further flags in the future may include 0hz (1hz tick offload) and timers,
workqueue, RCU, kthread, watchdog, likely all merged together in a
common flag ("async"?). In any case, this will have to be modifiable by
cpusets.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Wanpeng Li <kernellwp@gmail.com>
Link: http://lkml.kernel.org/r/1509072159-31808-12-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/isolation.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 8f666bc5abe8..b71b436f59f2 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/static_key.h>
+#include <linux/ctype.h>
 
 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -126,6 +127,29 @@ __setup("nohz_full=", housekeeping_nohz_full_setup);
 
 static int __init housekeeping_isolcpus_setup(char *str)
 {
-	return housekeeping_setup(str, HK_FLAG_DOMAIN);
+	unsigned int flags = 0;
+
+	while (isalpha(*str)) {
+		if (!strncmp(str, "nohz,", 5)) {
+			str += 5;
+			flags |= HK_FLAG_TICK;
+			continue;
+		}
+
+		if (!strncmp(str, "domain,", 7)) {
+			str += 7;
+			flags |= HK_FLAG_DOMAIN;
+			continue;
+		}
+
+		pr_warn("isolcpus: Error, unknown flag\n");
+		return 0;
+	}
+
+	/* Default behaviour for isolcpus without flags */
+	if (!flags)
+		flags |= HK_FLAG_DOMAIN;
+
+	return housekeeping_setup(str, flags);
 }
 __setup("isolcpus=", housekeeping_isolcpus_setup);
-- 
cgit v1.3-14-g43fede


From 7d9285e82db5defca4d9674ba089429eeca0c697 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Thu, 5 Oct 2017 09:19:19 -0700
Subject: perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf:
 perf event change needed for subsequent bpf helpers"

eBPF programs would like access to the (perf) event enabled and
running times along with the event value, such that they can deal with
event multiplexing (among other things).

This patch extends the interface; a future eBPF patch will utilize
the new functionality.

[ Note, there's a same-content commit with a poor changelog and a meaningless
  title in the networking tree as well - but we need this change for subsequent
  perf work, so apply it here as well, with a proper changelog. Hopefully Git
  will be able to sort out this somewhat messy workflow, if there are no other,
  conflicting changes to these files. ]

Signed-off-by: Yonghong Song <yhs@fb.com>
[ Rewrote the changelog. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <ast@fb.com>
Cc: <daniel@iogearbox.net>
Cc: <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: David S. Miller <davem@davemloft.net>
Link: http://lkml.kernel.org/r/20171005161923.332790-2-yhs@fb.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  7 +++++--
 kernel/bpf/arraymap.c      |  2 +-
 kernel/events/core.c       | 15 +++++++++++++--
 kernel/trace/bpf_trace.c   |  2 +-
 4 files changed, 20 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8e22f24ded6a..79b18a20cf5d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -806,6 +806,7 @@ struct perf_output_handle {
 struct bpf_perf_event_data_kern {
 	struct pt_regs *regs;
 	struct perf_sample_data *data;
+	struct perf_event *event;
 };
 
 #ifdef CONFIG_CGROUP_PERF
@@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
 				void *context);
 extern void perf_pmu_migrate_context(struct pmu *pmu,
 				int src_cpu, int dst_cpu);
-int perf_event_read_local(struct perf_event *event, u64 *value);
+int perf_event_read_local(struct perf_event *event, u64 *value,
+			  u64 *enabled, u64 *running);
 extern u64 perf_event_read_value(struct perf_event *event,
 				 u64 *enabled, u64 *running);
 
@@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline int perf_event_read_local(struct perf_event *event, u64 *value)
+static inline int perf_event_read_local(struct perf_event *event, u64 *value,
+					u64 *enabled, u64 *running)
 {
 	return -EINVAL;
 }
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index e2636737b69b..c4b9ab01bba5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
 
 	ee = ERR_PTR(-EOPNOTSUPP);
 	event = perf_file->private_data;
-	if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+	if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
 		goto err_out;
 
 	ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 04989fb769f0..74671e101b92 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event)
  *     will not be local and we cannot read them atomically
  *   - must not have a pmu::count method
  */
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+			  u64 *enabled, u64 *running)
 {
 	unsigned long flags;
 	int ret = 0;
+	u64 now;
 
 	/*
 	 * Disabling interrupts avoids all counter scheduling (context
@@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
 		goto out;
 	}
 
+	now = event->shadow_ctx_time + perf_clock();
+	if (enabled)
+		*enabled = now - event->tstamp_enabled;
 	/*
 	 * If the event is currently on this CPU, its either a per-task event,
 	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
 	 * oncpu == -1).
 	 */
-	if (event->oncpu == smp_processor_id())
+	if (event->oncpu == smp_processor_id()) {
 		event->pmu->read(event);
+		if (running)
+			*running = now - event->tstamp_running;
+	} else if (running) {
+		*running = event->total_time_running;
+	}
 
 	*value = local64_read(&event->count);
 out:
@@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event,
 	struct bpf_perf_event_data_kern ctx = {
 		.data = data,
 		.regs = regs,
+		.event = event,
 	};
 	int ret = 0;
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..95888ae6c263 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;
 
-	err = perf_event_read_local(ee->event, &value);
+	err = perf_event_read_local(ee->event, &value, NULL, NULL);
 	/*
 	 * this api is ugly since we miss [-22..-2] range of valid
 	 * counter values, but that's uapi
-- 
cgit v1.3-14-g43fede


From ca0dd44cf37bfe316751e1701439c3ff44beb05c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Sep 2017 13:23:44 +0200
Subject: perf/core: Fix perf_event_read_value() locking

perf_event_read_value() is an external accessor, just like
perf_event_{en,dis}able() and should thus use perf_event_ctx_lock().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: f63a8daa5812 ("perf: Fix event->ctx locking")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 74671e101b92..d636f1d0e2ab 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4398,7 +4398,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 {
 	struct perf_event *child;
 	u64 total = 0;
@@ -4426,6 +4426,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 
 	return total;
 }
+
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+	struct perf_event_context *ctx;
+	u64 count;
+
+	ctx = perf_event_ctx_lock(event);
+	count = __perf_event_read_value(event, enabled, running);
+	perf_event_ctx_unlock(event, ctx);
+
+	return count;
+}
 EXPORT_SYMBOL_GPL(perf_event_read_value);
 
 static int __perf_read_group_add(struct perf_event *leader,
@@ -4528,7 +4540,7 @@ static int perf_read_one(struct perf_event *event,
 	u64 values[4];
 	int n = 0;
 
-	values[n++] = perf_event_read_value(event, &enabled, &running);
+	values[n++] = __perf_event_read_value(event, &enabled, &running);
 	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		values[n++] = enabled;
 	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-- 
cgit v1.3-14-g43fede


From 0ee098c97a6ebe163180e74b740c0a37bcdaa173 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Sep 2017 13:24:28 +0200
Subject: perf/core: Update ctx time before detaching events

We should make sure the ctx time is updated before we detach events;
which will want to update event times.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index d636f1d0e2ab..5fae98626b15 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11129,6 +11129,7 @@ static void __perf_event_exit_context(void *__info)
 	struct perf_event *event;
 
 	raw_spin_lock(&ctx->lock);
+	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 	list_for_each_entry(event, &ctx->event_list, event_entry)
 		__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
 	raw_spin_unlock(&ctx->lock);
-- 
cgit v1.3-14-g43fede


From a9cd8194e1e6bd09619954721dfaf0f94fe2003e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Sep 2017 13:38:24 +0200
Subject: perf/core: Fix __perf_read_group_add() locking

Event timestamps are serialized using ctx->lock, make sure to hold it
over reading all values.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5fae98626b15..0f34f2a47eb6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4453,6 +4453,8 @@ static int __perf_read_group_add(struct perf_event *leader,
 	if (ret)
 		return ret;
 
+	raw_spin_lock_irqsave(&ctx->lock, flags);
+
 	/*
 	 * Since we co-schedule groups, {enabled,running} times of siblings
 	 * will be identical to those of the leader, so we only publish one
@@ -4475,8 +4477,6 @@ static int __perf_read_group_add(struct perf_event *leader,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
 
-	raw_spin_lock_irqsave(&ctx->lock, flags);
-
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		values[n++] += perf_event_count(sub);
 		if (read_format & PERF_FORMAT_ID)
-- 
cgit v1.3-14-g43fede


From 3c5c8711dcb32115156cc7672fa095afa4339eaa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Sep 2017 13:44:51 +0200
Subject: perf/core: Make sure to update ctx time before using it

We should make sure to update ctx time before we use it to update
event times.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0f34f2a47eb6..b249e0f197a7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1893,6 +1893,11 @@ __perf_remove_from_context(struct perf_event *event,
 {
 	unsigned long flags = (unsigned long)info;
 
+	if (ctx->is_active & EVENT_TIME) {
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx);
+	}
+
 	event_sched_out(event, cpuctx, ctx);
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
@@ -1955,8 +1960,11 @@ static void __perf_event_disable(struct perf_event *event,
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return;
 
-	update_context_time(ctx);
-	update_cgrp_time_from_event(event);
+	if (ctx->is_active & EVENT_TIME) {
+		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
+
 	update_group_times(event);
 	if (event == event->group_leader)
 		group_sched_out(event, cpuctx, ctx);
-- 
cgit v1.3-14-g43fede


From 8ca2bd41c7d1c135e9ac6f25970c2d491865088a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Sep 2017 14:12:35 +0200
Subject: perf/core: Rename 'enum perf_event_active_state'

Its a weird name, active is one of the states, it should not be part
of the name, also, its too long.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h | 6 +++---
 kernel/events/core.c       | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 79b18a20cf5d..b7532650de47 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -485,9 +485,9 @@ struct perf_addr_filters_head {
 };
 
 /**
- * enum perf_event_active_state - the states of a event
+ * enum perf_event_state - the states of a event
  */
-enum perf_event_active_state {
+enum perf_event_state {
 	PERF_EVENT_STATE_DEAD		= -4,
 	PERF_EVENT_STATE_EXIT		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
@@ -578,7 +578,7 @@ struct perf_event {
 	struct pmu			*pmu;
 	void				*pmu_private;
 
-	enum perf_event_active_state	state;
+	enum perf_event_state		state;
 	unsigned int			attach_state;
 	local64_t			count;
 	atomic64_t			child_count;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b249e0f197a7..6322e245176c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10801,7 +10801,7 @@ inherit_event(struct perf_event *parent_event,
 	      struct perf_event *group_leader,
 	      struct perf_event_context *child_ctx)
 {
-	enum perf_event_active_state parent_state = parent_event->state;
+	enum perf_event_state parent_state = parent_event->state;
 	struct perf_event *child_event;
 	unsigned long flags;
 
-- 
cgit v1.3-14-g43fede


From 7f0ec32526d2446fdd79b0c6c6d08e64ef9fb1f3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Sep 2017 14:17:58 +0200
Subject: perf/core: Remove wrong barrier

The barrier and comment make no sense:

 - if what the barrier says is true, it should be wmb() but that
   should then be part of the arch driver, not the generic code.

 - if it is an SMP barrier, there must be a matching barrier, and
   there isn't one.

So kill it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6322e245176c..205a4321f678 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2097,11 +2097,6 @@ event_sched_in(struct perf_event *event,
 		event->hw.interrupts = 0;
 	}
 
-	/*
-	 * The new state must be visible before we turn it on in the hardware:
-	 */
-	smp_wmb();
-
 	perf_pmu_disable(event->pmu);
 
 	perf_set_shadow_time(event, ctx, tstamp);
-- 
cgit v1.3-14-g43fede


From 0c1cbc18df9e38182a0604b15535699c84d7342a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Sep 2017 16:26:44 +0200
Subject: perf/core: Fix perf_event_read()

perf_event_read() has a number of issues regarding the timekeeping bits.

 - The IPI didn't update group times when it found INACTIVE

 - The direct call would not re-check ->state after taking ctx->lock
   which can result in ->count and timestamps getting out of sync.

And we can make use of the ordering introduced for perf_event_stop()
to make it more accurate for ACTIVE.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 55 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 205a4321f678..6f74f9c35490 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2081,8 +2081,9 @@ event_sched_in(struct perf_event *event,
 
 	WRITE_ONCE(event->oncpu, smp_processor_id());
 	/*
-	 * Order event::oncpu write to happen before the ACTIVE state
-	 * is visible.
+	 * Order event::oncpu write to happen before the ACTIVE state is
+	 * visible. This allows perf_event_{stop,read}() to observe the correct
+	 * ->oncpu if it sees ACTIVE.
 	 */
 	smp_wmb();
 	WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
@@ -3638,12 +3639,16 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	if (ctx->is_active) {
+	if (ctx->is_active & EVENT_TIME) {
 		update_context_time(ctx);
 		update_cgrp_time_from_event(event);
 	}
 
-	update_event_times(event);
+	if (!data->group)
+		update_event_times(event);
+	else
+		update_group_times(event);
+
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		goto unlock;
 
@@ -3658,7 +3663,6 @@ static void __perf_event_read(void *info)
 	pmu->read(event);
 
 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		update_event_times(sub);
 		if (sub->state == PERF_EVENT_STATE_ACTIVE) {
 			/*
 			 * Use sibling's PMU rather than @event's since
@@ -3748,23 +3752,35 @@ out:
 
 static int perf_event_read(struct perf_event *event, bool group)
 {
+	enum perf_event_state state = READ_ONCE(event->state);
 	int event_cpu, ret = 0;
 
 	/*
 	 * If event is enabled and currently active on a CPU, update the
 	 * value in the event structure:
 	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE) {
-		struct perf_read_data data = {
-			.event = event,
-			.group = group,
-			.ret = 0,
-		};
+again:
+	if (state == PERF_EVENT_STATE_ACTIVE) {
+		struct perf_read_data data;
+
+		/*
+		 * Orders the ->state and ->oncpu loads such that if we see
+		 * ACTIVE we must also see the right ->oncpu.
+		 *
+		 * Matches the smp_wmb() from event_sched_in().
+		 */
+		smp_rmb();
 
 		event_cpu = READ_ONCE(event->oncpu);
 		if ((unsigned)event_cpu >= nr_cpu_ids)
 			return 0;
 
+		data = (struct perf_read_data){
+			.event = event,
+			.group = group,
+			.ret = 0,
+		};
+
 		preempt_disable();
 		event_cpu = __perf_event_read_cpu(event, event_cpu);
 
@@ -3781,20 +3797,27 @@ static int perf_event_read(struct perf_event *event, bool group)
 		(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
 		preempt_enable();
 		ret = data.ret;
-	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+
+	} else if (state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
 		unsigned long flags;
 
 		raw_spin_lock_irqsave(&ctx->lock, flags);
+		state = event->state;
+		if (state != PERF_EVENT_STATE_INACTIVE) {
+			raw_spin_unlock_irqrestore(&ctx->lock, flags);
+			goto again;
+		}
+
 		/*
-		 * may read while context is not active
-		 * (e.g., thread is blocked), in that case
-		 * we cannot update context time
+		 * May read while context is not active (e.g., thread is
+		 * blocked), in that case we cannot update context time
 		 */
-		if (ctx->is_active) {
+		if (ctx->is_active & EVENT_TIME) {
 			update_context_time(ctx);
 			update_cgrp_time_from_event(event);
 		}
+
 		if (group)
 			update_group_times(event);
 		else
-- 
cgit v1.3-14-g43fede


From 0d3d73aac2ff05c78387aa9dcc2c8aa3804405e7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 5 Sep 2017 14:16:28 +0200
Subject: perf/core: Rewrite event timekeeping

The current even timekeeping, which computes enabled and running
times, uses 3 distinct timestamps to reflect the various event states:
OFF (stopped), INACTIVE (enabled) and ACTIVE (running).

Furthermore, the update rules are such that even INACTIVE events need
their timestamps updated. This is undesirable because we'd like to not
touch INACTIVE events if at all possible, this makes event scheduling
(much) more expensive than needed.

Rewrite the timekeeping to directly use event->state, this greatly
simplifies the code and results in only having to update things when
we change state, or an up-to-date value is requested (read).

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  19 +--
 kernel/events/core.c       | 385 +++++++++++++++------------------------------
 2 files changed, 130 insertions(+), 274 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b7532650de47..874b71a70058 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -588,26 +588,10 @@ struct perf_event {
 	 * has been enabled (i.e. eligible to run, and the task has
 	 * been scheduled in, if this is a per-task event)
 	 * and running (scheduled onto the CPU), respectively.
-	 *
-	 * They are computed from tstamp_enabled, tstamp_running and
-	 * tstamp_stopped when the event is in INACTIVE or ACTIVE state.
 	 */
 	u64				total_time_enabled;
 	u64				total_time_running;
-
-	/*
-	 * These are timestamps used for computing total_time_enabled
-	 * and total_time_running when the event is in INACTIVE or
-	 * ACTIVE state, measured in nanoseconds from an arbitrary point
-	 * in time.
-	 * tstamp_enabled: the notional time when the event was enabled
-	 * tstamp_running: the notional time when the event was scheduled on
-	 * tstamp_stopped: in INACTIVE state, the notional time when the
-	 *	event was scheduled off.
-	 */
-	u64				tstamp_enabled;
-	u64				tstamp_running;
-	u64				tstamp_stopped;
+	u64				tstamp;
 
 	/*
 	 * timestamp shadows the actual context timing but it can
@@ -699,7 +683,6 @@ struct perf_event {
 
 #ifdef CONFIG_CGROUP_PERF
 	struct perf_cgroup		*cgrp; /* cgroup event is attach to */
-	int				cgrp_defer_enabled;
 #endif
 
 	struct list_head		sb_list;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6f74f9c35490..2551e8ce7224 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event)
 	return event->clock();
 }
 
+/*
+ * State based event timekeeping...
+ *
+ * The basic idea is to use event->state to determine which (if any) time
+ * fields to increment with the current delta. This means we only need to
+ * update timestamps when we change state or when they are explicitly requested
+ * (read).
+ *
+ * Event groups make things a little more complicated, but not terribly so. The
+ * rules for a group are that if the group leader is OFF the entire group is
+ * OFF, irrespecive of what the group member states are. This results in
+ * __perf_effective_state().
+ *
+ * A futher ramification is that when a group leader flips between OFF and
+ * !OFF, we need to update all group member times.
+ *
+ *
+ * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+ * need to make sure the relevant context time is updated before we try and
+ * update our timestamps.
+ */
+
+static __always_inline enum perf_event_state
+__perf_effective_state(struct perf_event *event)
+{
+	struct perf_event *leader = event->group_leader;
+
+	if (leader->state <= PERF_EVENT_STATE_OFF)
+		return leader->state;
+
+	return event->state;
+}
+
+static __always_inline void
+__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+{
+	enum perf_event_state state = __perf_effective_state(event);
+	u64 delta = now - event->tstamp;
+
+	*enabled = event->total_time_enabled;
+	if (state >= PERF_EVENT_STATE_INACTIVE)
+		*enabled += delta;
+
+	*running = event->total_time_running;
+	if (state >= PERF_EVENT_STATE_ACTIVE)
+		*running += delta;
+}
+
+static void perf_event_update_time(struct perf_event *event)
+{
+	u64 now = perf_event_time(event);
+
+	__perf_update_times(event, now, &event->total_time_enabled,
+					&event->total_time_running);
+	event->tstamp = now;
+}
+
+static void perf_event_update_sibling_time(struct perf_event *leader)
+{
+	struct perf_event *sibling;
+
+	list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+		perf_event_update_time(sibling);
+}
+
+static void
+perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+{
+	if (event->state == state)
+		return;
+
+	perf_event_update_time(event);
+	/*
+	 * If a group leader gets enabled/disabled all its siblings
+	 * are affected too.
+	 */
+	if ((event->state < 0) ^ (state < 0))
+		perf_event_update_sibling_time(event);
+
+	WRITE_ONCE(event->state, state);
+}
+
 #ifdef CONFIG_CGROUP_PERF
 
 static inline bool
@@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 	event->shadow_ctx_time = now - t->timestamp;
 }
 
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-	/*
-	 * when the current task's perf cgroup does not match
-	 * the event's, we need to remember to call the
-	 * perf_mark_enable() function the first time a task with
-	 * a matching perf cgroup is scheduled in.
-	 */
-	if (is_cgroup_event(event) && !perf_cgroup_match(event))
-		event->cgrp_defer_enabled = 1;
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
-			 struct perf_event_context *ctx)
-{
-	struct perf_event *sub;
-	u64 tstamp = perf_event_time(event);
-
-	if (!event->cgrp_defer_enabled)
-		return;
-
-	event->cgrp_defer_enabled = 0;
-
-	event->tstamp_enabled = tstamp - event->total_time_enabled;
-	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
-			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
-			sub->cgrp_defer_enabled = 0;
-		}
-	}
-}
-
 /*
  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
  * cleared when last cgroup event is removed.
@@ -972,17 +1020,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
 	return 0;
 }
 
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
-			 struct perf_event_context *ctx)
-{
-}
-
 static inline void
 list_update_cgroup_event(struct perf_event *event,
 			 struct perf_event_context *ctx, bool add)
@@ -1396,60 +1433,6 @@ static u64 perf_event_time(struct perf_event *event)
 	return ctx ? ctx->time : 0;
 }
 
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
-	struct perf_event_context *ctx = event->ctx;
-	u64 run_end;
-
-	lockdep_assert_held(&ctx->lock);
-
-	if (event->state < PERF_EVENT_STATE_INACTIVE ||
-	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
-		return;
-
-	/*
-	 * in cgroup mode, time_enabled represents
-	 * the time the event was enabled AND active
-	 * tasks were in the monitored cgroup. This is
-	 * independent of the activity of the context as
-	 * there may be a mix of cgroup and non-cgroup events.
-	 *
-	 * That is why we treat cgroup events differently
-	 * here.
-	 */
-	if (is_cgroup_event(event))
-		run_end = perf_cgroup_event_time(event);
-	else if (ctx->is_active)
-		run_end = ctx->time;
-	else
-		run_end = event->tstamp_stopped;
-
-	event->total_time_enabled = run_end - event->tstamp_enabled;
-
-	if (event->state == PERF_EVENT_STATE_INACTIVE)
-		run_end = event->tstamp_stopped;
-	else
-		run_end = perf_event_time(event);
-
-	event->total_time_running = run_end - event->tstamp_running;
-
-}
-
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
-	struct perf_event *event;
-
-	update_event_times(leader);
-	list_for_each_entry(event, &leader->sibling_list, group_entry)
-		update_event_times(event);
-}
-
 static enum event_type_t get_event_type(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -1492,6 +1475,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
 	event->attach_state |= PERF_ATTACH_CONTEXT;
 
+	event->tstamp = perf_event_time(event);
+
 	/*
 	 * If we're a stand alone event or group leader, we go to the context
 	 * list, group events are kept attached to the group so that
@@ -1699,8 +1684,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (event->group_leader == event)
 		list_del_init(&event->group_entry);
 
-	update_group_times(event);
-
 	/*
 	 * If event was in error state, then keep it
 	 * that way, otherwise bogus counts will be
@@ -1709,7 +1692,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * of the event
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
-		event->state = PERF_EVENT_STATE_OFF;
+		perf_event_set_state(event, PERF_EVENT_STATE_OFF);
 
 	ctx->generation++;
 }
@@ -1808,38 +1791,24 @@ event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
-	u64 delta;
+	enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
 
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
-	/*
-	 * An event which could not be activated because of
-	 * filter mismatch still needs to have its timings
-	 * maintained, otherwise bogus information is return
-	 * via read() for time_enabled, time_running:
-	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE &&
-	    !event_filter_match(event)) {
-		delta = tstamp - event->tstamp_stopped;
-		event->tstamp_running += delta;
-		event->tstamp_stopped = tstamp;
-	}
-
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		return;
 
 	perf_pmu_disable(event->pmu);
 
-	event->tstamp_stopped = tstamp;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
-	event->state = PERF_EVENT_STATE_INACTIVE;
+
 	if (event->pending_disable) {
 		event->pending_disable = 0;
-		event->state = PERF_EVENT_STATE_OFF;
+		state = PERF_EVENT_STATE_OFF;
 	}
+	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu--;
@@ -1859,7 +1828,9 @@ group_sched_out(struct perf_event *group_event,
 		struct perf_event_context *ctx)
 {
 	struct perf_event *event;
-	int state = group_event->state;
+
+	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+		return;
 
 	perf_pmu_disable(ctx->pmu);
 
@@ -1873,7 +1844,7 @@ group_sched_out(struct perf_event *group_event,
 
 	perf_pmu_enable(ctx->pmu);
 
-	if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+	if (group_event->attr.exclusive)
 		cpuctx->exclusive = 0;
 }
 
@@ -1965,12 +1936,12 @@ static void __perf_event_disable(struct perf_event *event,
 		update_cgrp_time_from_event(event);
 	}
 
-	update_group_times(event);
 	if (event == event->group_leader)
 		group_sched_out(event, cpuctx, ctx);
 	else
 		event_sched_out(event, cpuctx, ctx);
-	event->state = PERF_EVENT_STATE_OFF;
+
+	perf_event_set_state(event, PERF_EVENT_STATE_OFF);
 }
 
 /*
@@ -2027,8 +1998,7 @@ void perf_event_disable_inatomic(struct perf_event *event)
 }
 
 static void perf_set_shadow_time(struct perf_event *event,
-				 struct perf_event_context *ctx,
-				 u64 tstamp)
+				 struct perf_event_context *ctx)
 {
 	/*
 	 * use the correct time source for the time snapshot
@@ -2056,9 +2026,9 @@ static void perf_set_shadow_time(struct perf_event *event,
 	 * is cleaner and simpler to understand.
 	 */
 	if (is_cgroup_event(event))
-		perf_cgroup_set_shadow_time(event, tstamp);
+		perf_cgroup_set_shadow_time(event, event->tstamp);
 	else
-		event->shadow_ctx_time = tstamp - ctx->timestamp;
+		event->shadow_ctx_time = event->tstamp - ctx->timestamp;
 }
 
 #define MAX_INTERRUPTS (~0ULL)
@@ -2071,7 +2041,6 @@ event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
 	int ret = 0;
 
 	lockdep_assert_held(&ctx->lock);
@@ -2086,7 +2055,7 @@ event_sched_in(struct perf_event *event,
 	 * ->oncpu if it sees ACTIVE.
 	 */
 	smp_wmb();
-	WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
+	perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
 
 	/*
 	 * Unthrottle events, since we scheduled we might have missed several
@@ -2100,19 +2069,17 @@ event_sched_in(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
-	perf_set_shadow_time(event, ctx, tstamp);
+	perf_set_shadow_time(event, ctx);
 
 	perf_log_itrace_start(event);
 
 	if (event->pmu->add(event, PERF_EF_START)) {
-		event->state = PERF_EVENT_STATE_INACTIVE;
+		perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 		event->oncpu = -1;
 		ret = -EAGAIN;
 		goto out;
 	}
 
-	event->tstamp_running += tstamp - event->tstamp_stopped;
-
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
 	if (!ctx->nr_active++)
@@ -2136,8 +2103,6 @@ group_sched_in(struct perf_event *group_event,
 {
 	struct perf_event *event, *partial_group = NULL;
 	struct pmu *pmu = ctx->pmu;
-	u64 now = ctx->time;
-	bool simulate = false;
 
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
@@ -2167,27 +2132,13 @@ group_error:
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
-	 * The events up to the failed event are scheduled out normally,
-	 * tstamp_stopped will be updated.
-	 *
-	 * The failed events and the remaining siblings need to have
-	 * their timings updated as if they had gone thru event_sched_in()
-	 * and event_sched_out(). This is required to get consistent timings
-	 * across the group. This also takes care of the case where the group
-	 * could never be scheduled by ensuring tstamp_stopped is set to mark
-	 * the time the event was actually stopped, such that time delta
-	 * calculation in update_event_times() is correct.
+	 * The events up to the failed event are scheduled out normally.
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 		if (event == partial_group)
-			simulate = true;
+			break;
 
-		if (simulate) {
-			event->tstamp_running += now - event->tstamp_stopped;
-			event->tstamp_stopped = now;
-		} else {
-			event_sched_out(event, cpuctx, ctx);
-		}
+		event_sched_out(event, cpuctx, ctx);
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
@@ -2229,46 +2180,11 @@ static int group_can_go_on(struct perf_event *event,
 	return can_add_hw;
 }
 
-/*
- * Complement to update_event_times(). This computes the tstamp_* values to
- * continue 'enabled' state from @now, and effectively discards the time
- * between the prior tstamp_stopped and now (as we were in the OFF state, or
- * just switched (context) time base).
- *
- * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
- * cannot have been scheduled in yet. And going into INACTIVE state means
- * '@event->tstamp_stopped = @now'.
- *
- * Thus given the rules of update_event_times():
- *
- *   total_time_enabled = tstamp_stopped - tstamp_enabled
- *   total_time_running = tstamp_stopped - tstamp_running
- *
- * We can insert 'tstamp_stopped == now' and reverse them to compute new
- * tstamp_* values.
- */
-static void __perf_event_enable_time(struct perf_event *event, u64 now)
-{
-	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
-
-	event->tstamp_stopped = now;
-	event->tstamp_enabled = now - event->total_time_enabled;
-	event->tstamp_running = now - event->total_time_running;
-}
-
 static void add_event_to_ctx(struct perf_event *event,
 			       struct perf_event_context *ctx)
 {
-	u64 tstamp = perf_event_time(event);
-
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	/*
-	 * We can be called with event->state == STATE_OFF when we create with
-	 * .disabled = 1. In that case the IOC_ENABLE will call this function.
-	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE)
-		__perf_event_enable_time(event, tstamp);
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2499,28 +2415,6 @@ again:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Put a event into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_event_mark_enabled(struct perf_event *event)
-{
-	struct perf_event *sub;
-	u64 tstamp = perf_event_time(event);
-
-	event->state = PERF_EVENT_STATE_INACTIVE;
-	__perf_event_enable_time(event, tstamp);
-	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		/* XXX should not be > INACTIVE if event isn't */
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
-			__perf_event_enable_time(sub, tstamp);
-	}
-}
-
 /*
  * Cross CPU call to enable a performance event
  */
@@ -2539,14 +2433,12 @@ static void __perf_event_enable(struct perf_event *event,
 	if (ctx->is_active)
 		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 
-	__perf_event_mark_enabled(event);
+	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 
 	if (!ctx->is_active)
 		return;
 
 	if (!event_filter_match(event)) {
-		if (is_cgroup_event(event))
-			perf_cgroup_defer_enabled(event);
 		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
@@ -2866,18 +2758,10 @@ static void __perf_event_sync_stat(struct perf_event *event,
 	 * we know the event must be on the current CPU, therefore we
 	 * don't need to use it.
 	 */
-	switch (event->state) {
-	case PERF_EVENT_STATE_ACTIVE:
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
 		event->pmu->read(event);
-		/* fall-through */
 
-	case PERF_EVENT_STATE_INACTIVE:
-		update_event_times(event);
-		break;
-
-	default:
-		break;
-	}
+	perf_event_update_time(event);
 
 	/*
 	 * In order to keep per-task stats reliable we need to flip the event
@@ -3114,10 +2998,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
 		if (group_can_go_on(event, cpuctx, 1))
 			group_sched_in(event, cpuctx, ctx);
 
@@ -3125,10 +3005,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 		 * If this pinned group hasn't been scheduled,
 		 * put it in error state.
 		 */
-		if (event->state == PERF_EVENT_STATE_INACTIVE) {
-			update_group_times(event);
-			event->state = PERF_EVENT_STATE_ERROR;
-		}
+		if (event->state == PERF_EVENT_STATE_INACTIVE)
+			perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 	}
 }
 
@@ -3150,10 +3028,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		if (!event_filter_match(event))
 			continue;
 
-		/* may need to reset tstamp_enabled */
-		if (is_cgroup_event(event))
-			perf_cgroup_mark_enabled(event, ctx);
-
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
 			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
@@ -3545,7 +3419,7 @@ static int event_enable_on_exec(struct perf_event *event,
 	if (event->state >= PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	__perf_event_mark_enabled(event);
+	perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
 
 	return 1;
 }
@@ -3644,10 +3518,9 @@ static void __perf_event_read(void *info)
 		update_cgrp_time_from_event(event);
 	}
 
-	if (!data->group)
-		update_event_times(event);
-	else
-		update_group_times(event);
+	perf_event_update_time(event);
+	if (data->group)
+		perf_event_update_sibling_time(event);
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
 		goto unlock;
@@ -3696,7 +3569,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 {
 	unsigned long flags;
 	int ret = 0;
-	u64 now;
 
 	/*
 	 * Disabling interrupts avoids all counter scheduling (context
@@ -3727,23 +3599,26 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 		goto out;
 	}
 
-	now = event->shadow_ctx_time + perf_clock();
-	if (enabled)
-		*enabled = now - event->tstamp_enabled;
+
 	/*
 	 * If the event is currently on this CPU, its either a per-task event,
 	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
 	 * oncpu == -1).
 	 */
-	if (event->oncpu == smp_processor_id()) {
+	if (event->oncpu == smp_processor_id())
 		event->pmu->read(event);
-		if (running)
-			*running = now - event->tstamp_running;
-	} else if (running) {
-		*running = event->total_time_running;
-	}
 
 	*value = local64_read(&event->count);
+	if (enabled || running) {
+		u64 now = event->shadow_ctx_time + perf_clock();
+		u64 __enabled, __running;
+
+		__perf_update_times(event, now, &__enabled, &__running);
+		if (enabled)
+			*enabled = __enabled;
+		if (running)
+			*running = __running;
+	}
 out:
 	local_irq_restore(flags);
 
@@ -3818,10 +3693,9 @@ again:
 			update_cgrp_time_from_event(event);
 		}
 
+		perf_event_update_time(event);
 		if (group)
-			update_group_times(event);
-		else
-			update_event_times(event);
+			perf_event_update_sibling_time(event);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
@@ -4945,8 +4819,7 @@ static void calc_timer_values(struct perf_event *event,
 
 	*now = perf_clock();
 	ctx_time = event->shadow_ctx_time + *now;
-	*enabled = ctx_time - event->tstamp_enabled;
-	*running = ctx_time - event->tstamp_running;
+	__perf_update_times(event, ctx_time, enabled, running);
 }
 
 static void perf_event_init_userpage(struct perf_event *event)
@@ -10581,7 +10454,7 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
-	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+	perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
 	/*
-- 
cgit v1.3-14-g43fede


From 035226b964c820f65e201cdf123705a8f1d7c670 Mon Sep 17 00:00:00 2001
From: Gianluca Borello <g.borello@gmail.com>
Date: Thu, 26 Oct 2017 01:47:42 +0000
Subject: bpf: remove tail_call and get_stackid helper declarations from bpf.h

commit afdb09c720b6 ("security: bpf: Add LSM hooks for bpf object related
syscall") included linux/bpf.h in linux/security.h. As a result, bpf
programs including bpf_helpers.h and some other header that ends up
pulling in also security.h, such as several examples under samples/bpf,
fail to compile because bpf_tail_call and bpf_get_stackid are now
"redefined as different kind of symbol".

>From bpf.h:

u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);

Whereas in bpf_helpers.h they are:

static void (*bpf_tail_call)(void *ctx, void *map, int index);
static int (*bpf_get_stackid)(void *ctx, void *map, int flags);

Fix this by removing the unused declaration of bpf_tail_call and moving
the declaration of bpf_get_stackid in bpf_trace.c, which is the only
place where it's needed.

Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      | 3 ---
 kernel/trace/bpf_trace.c | 2 ++
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 172be7faf7ba..520aeebe0d93 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -231,9 +231,6 @@ struct bpf_event_entry {
 	struct rcu_head rcu;
 };
 
-u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
-u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
-
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
 int bpf_prog_calc_tag(struct bpf_prog *fp);
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index b65011d320e3..136aa6bb0422 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -15,6 +15,8 @@
 #include <linux/ctype.h>
 #include "trace.h"
 
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+
 /**
  * trace_call_bpf - invoke BPF program
  * @call: tracepoint event
-- 
cgit v1.3-14-g43fede


From 250a53d6fcd86012935d1cf71eb2e3d6e88c412c Mon Sep 17 00:00:00 2001
From: Christoffer Dall <cdall@linaro.org>
Date: Fri, 27 Oct 2017 10:34:33 +0200
Subject: genirq: Document vcpu_info usage for percpu_devid interrupts

It is currently unclear how to set the VCPU affinity for a percpu_devid
interrupt , since the Linux irq_data structure describes the state for
multiple interrupts, one for each physical CPU on the system.  Since
each such interrupt can be associated with different VCPUs or none at
all, associating a single VCPU state with such an interrupt does not
capture the necessary semantics.

The implementers of irq_set_affinity are the Intel and AMD IOMMUs, and
the ARM GIC irqchip.  The Intel and AMD callers do not appear to use
percpu_devid interrupts, and the ARM GIC implementation only checks the
pointer against NULL vs. non-NULL.

Therefore, simply update the function documentation to explain the
expected use in the context of percpu_devid interrupts, allowing future
changes or additions to irqchip implementers to do the right thing.

Signed-off-by: Christoffer Dall <cdall@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: kvm@vger.kernel.org
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Eric Auger <eric.auger@redhat.com>
Cc: kvmarm@lists.cs.columbia.edu
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lkml.kernel.org/r/1509093281-15225-13-git-send-email-cdall@linaro.org
---
 kernel/irq/manage.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e667912d0e9c..c65f282cbc9a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -381,7 +381,8 @@ int irq_select_affinity_usr(unsigned int irq)
 /**
  *	irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
  *	@irq: interrupt number to set affinity
- *	@vcpu_info: vCPU specific data
+ *	@vcpu_info: vCPU specific data or pointer to a percpu array of vCPU
+ *	            specific data for percpu_devid interrupts
  *
  *	This function uses the vCPU specific data to set the vCPU
  *	affinity for an irq. The vCPU specific data is passed from
-- 
cgit v1.3-14-g43fede


From bc8293663b953c23ff7b73eb15f82393425e5e47 Mon Sep 17 00:00:00 2001
From: Baoquan He <bhe@redhat.com>
Date: Sun, 22 Oct 2017 22:30:55 +0800
Subject: printk: fix typo in printk_safe.c

Link: http://lkml.kernel.org/r/1508682655-27293-1-git-send-email-bhe@redhat.com
Cc: rostedt@goodmis.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/printk/printk_safe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..89558b85f45c 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -75,7 +75,7 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
  * have dedicated buffers, because otherwise printk-safe preempted by
  * NMI-printk would have overwritten the NMI messages.
  *
- * The messages are fushed from irq work (or from panic()), possibly,
+ * The messages are flushed from irq work (or from panic()), possibly,
  * from other CPU, concurrently with printk_safe_log_store(). Should this
  * happen, printk_safe_log_store() will notice the buffer->len mismatch
  * and repeat the write.
-- 
cgit v1.3-14-g43fede


From cef572ad9bd7f85035ba8272e5352040e8be0152 Mon Sep 17 00:00:00 2001
From: Li Bin <huawei.libin@huawei.com>
Date: Sat, 28 Oct 2017 11:07:28 +0800
Subject: workqueue: Fix NULL pointer dereference

When queue_work() is used in irq (not in task context), there is
a potential case that trigger NULL pointer dereference.
----------------------------------------------------------------
worker_thread()
|-spin_lock_irq()
|-process_one_work()
	|-worker->current_pwq = pwq
	|-spin_unlock_irq()
	|-worker->current_func(work)
	|-spin_lock_irq()
 	|-worker->current_pwq = NULL
|-spin_unlock_irq()

				//interrupt here
				|-irq_handler
					|-__queue_work()
						//assuming that the wq is draining
						|-is_chained_work(wq)
							|-current_wq_worker()
							//Here, 'current' is the interrupted worker!
								|-current->current_pwq is NULL here!
|-schedule()
----------------------------------------------------------------

Avoid it by checking for task context in current_wq_worker(), and
if not in task context, we shouldn't use the 'current' to check the
condition.

Reported-by: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: Li Bin <huawei.libin@huawei.com>
Reviewed-by: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: 8d03ecfe4718 ("workqueue: reimplement is_chained_work() using current_wq_worker()")
Cc: stable@vger.kernel.org # v3.9+
---
 kernel/workqueue_internal.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 8635417c587b..29fa81f0f51a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -9,6 +9,7 @@
 
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
+#include <linux/preempt.h>
 
 struct worker_pool;
 
@@ -59,7 +60,7 @@ struct worker {
  */
 static inline struct worker *current_wq_worker(void)
 {
-	if (current->flags & PF_WQ_WORKER)
+	if (in_task() && (current->flags & PF_WQ_WORKER))
 		return kthread_data(current);
 	return NULL;
 }
-- 
cgit v1.3-14-g43fede


From c3ba13298709f46e72b22d087d0aa02bd012e4b0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 30 Oct 2017 08:13:14 -0700
Subject: cgroup: mark @cgrp __maybe_unused in cpu_stat_show()

The local variable @cgrp isn't used if !CONFIG_CGROUP_SCHED.  Mark the
variable with __maybe_unused to avoid a compile warning.

Reported-by: "kbuild-all@01.org" <kbuild-all@01.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d9773e49a1b4..d6ed725f36d9 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3357,7 +3357,7 @@ static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
 
 static int cpu_stat_show(struct seq_file *seq, void *v)
 {
-	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
 	int ret = 0;
 
 	cgroup_stat_show_cputime(seq);
-- 
cgit v1.3-14-g43fede


From 9afe77ed849de6af8532b4c1b9310102eed9edf7 Mon Sep 17 00:00:00 2001
From: Maxim Akristiniy <maksim.akristiniy@yotadevices.com>
Date: Mon, 23 Oct 2017 19:51:48 +0300
Subject: added new line symbol after warning about dropped messages

so this message will not mess with the next one

Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Maxim Akristiniy <maksim.akristiniy@yotadevices.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Petr Mladek <pmladek@suse.com>
---
 kernel/printk/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 512f7c2baedd..5d81206a572d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2190,7 +2190,7 @@ again:
 		}
 
 		if (console_seq < log_first_seq) {
-			len = sprintf(text, "** %u printk messages dropped ** ",
+			len = sprintf(text, "** %u printk messages dropped **\n",
 				      (unsigned)(log_first_seq - console_seq));
 
 			/* messages are gone, move to first one */
-- 
cgit v1.3-14-g43fede


From 0f295b0650c90362b4111f46d7f9149a0a4191be Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Date: Fri, 13 Oct 2017 11:54:33 -0600
Subject: rtc: Allow rtc drivers to specify the tv_nsec value for ntp

ntp is currently hardwired to try and call the rtc set when wall clock
tv_nsec is 0.5 seconds. This historical behaviour works well with certain
PC RTCs, but is not universal to all rtc hardware.

Change how this works by introducing the driver specific concept of
set_offset_nsec, the delay between current wall clock time and the target
time to set (with a 0 tv_nsecs).

For x86-style CMOS set_offset_nsec should be -0.5 s which causes the last
second to be written 0.5 s after it has started.

For compat with the old rtc_set_ntp_time, the value is defaulted to
+ 0.5 s, which causes the next second to be written 0.5s before it starts,
as things were before this patch.

Testing shows many non-x86 RTCs would like set_offset_nsec ~= 0,
so ultimately each RTC driver should set the set_offset_nsec according
to its needs, and non x86 architectures should stop using
update_persistent_clock64 in order to access this feature.
Future patches will revise the drivers as needed.

Since CMOS and RTC now have very different handling they are split
into two dedicated code paths, sharing the support code, and ifdefs
are replaced with IS_ENABLED.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 drivers/rtc/class.c   |   3 +
 drivers/rtc/systohc.c |  53 +++++++++++-----
 include/linux/rtc.h   |  43 ++++++++++++-
 kernel/time/ntp.c     | 166 ++++++++++++++++++++++++++++++++++----------------
 4 files changed, 196 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c
index 2ed970d61da1..722d683e0b0f 100644
--- a/drivers/rtc/class.c
+++ b/drivers/rtc/class.c
@@ -161,6 +161,9 @@ static struct rtc_device *rtc_allocate_device(void)
 
 	device_initialize(&rtc->dev);
 
+	/* Drivers can revise this default after allocating the device. */
+	rtc->set_offset_nsec =  NSEC_PER_SEC / 2;
+
 	rtc->irq_freq = 1;
 	rtc->max_user_freq = 64;
 	rtc->dev.class = rtc_class;
diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c
index b4a68ffcd06b..0c177647ea6c 100644
--- a/drivers/rtc/systohc.c
+++ b/drivers/rtc/systohc.c
@@ -10,6 +10,7 @@
 /**
  * rtc_set_ntp_time - Save NTP synchronized time to the RTC
  * @now: Current time of day
+ * @target_nsec: pointer for desired now->tv_nsec value
  *
  * Replacement for the NTP platform function update_persistent_clock64
  * that stores time for later retrieval by rtc_hctosys.
@@ -18,30 +19,52 @@
  * possible at all, and various other -errno for specific temporary failure
  * cases.
  *
+ * -EPROTO is returned if now.tv_nsec is not close enough to *target_nsec.
+ (
  * If temporary failure is indicated the caller should try again 'soon'
  */
-int rtc_set_ntp_time(struct timespec64 now)
+int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec)
 {
 	struct rtc_device *rtc;
 	struct rtc_time tm;
+	struct timespec64 to_set;
 	int err = -ENODEV;
-
-	if (now.tv_nsec < (NSEC_PER_SEC >> 1))
-		rtc_time64_to_tm(now.tv_sec, &tm);
-	else
-		rtc_time64_to_tm(now.tv_sec + 1, &tm);
+	bool ok;
 
 	rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE);
-	if (rtc) {
-		/* rtc_hctosys exclusively uses UTC, so we call set_time here,
-		 * not set_mmss. */
-		if (rtc->ops &&
-		    (rtc->ops->set_time ||
-		     rtc->ops->set_mmss64 ||
-		     rtc->ops->set_mmss))
-			err = rtc_set_time(rtc, &tm);
-		rtc_class_close(rtc);
+	if (!rtc)
+		goto out_err;
+
+	if (!rtc->ops || (!rtc->ops->set_time && !rtc->ops->set_mmss64 &&
+			  !rtc->ops->set_mmss))
+		goto out_close;
+
+	/* Compute the value of tv_nsec we require the caller to supply in
+	 * now.tv_nsec.  This is the value such that (now +
+	 * set_offset_nsec).tv_nsec == 0.
+	 */
+	set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec);
+	*target_nsec = to_set.tv_nsec;
+
+	/* The ntp code must call this with the correct value in tv_nsec, if
+	 * it does not we update target_nsec and return EPROTO to make the ntp
+	 * code try again later.
+	 */
+	ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now);
+	if (!ok) {
+		err = -EPROTO;
+		goto out_close;
 	}
 
+	rtc_time64_to_tm(to_set.tv_sec, &tm);
+
+	/* rtc_hctosys exclusively uses UTC, so we call set_time here, not
+	 * set_mmss.
+	 */
+	err = rtc_set_time(rtc, &tm);
+
+out_close:
+	rtc_class_close(rtc);
+out_err:
 	return err;
 }
diff --git a/include/linux/rtc.h b/include/linux/rtc.h
index e6d0f9c1cafd..5b13fa029fd6 100644
--- a/include/linux/rtc.h
+++ b/include/linux/rtc.h
@@ -135,6 +135,14 @@ struct rtc_device {
 	/* Some hardware can't support UIE mode */
 	int uie_unsupported;
 
+	/* Number of nsec it takes to set the RTC clock. This influences when
+	 * the set ops are called. An offset:
+	 *   - of 0.5 s will call RTC set for wall clock time 10.0 s at 9.5 s
+	 *   - of 1.5 s will call RTC set for wall clock time 10.0 s at 8.5 s
+	 *   - of -0.5 s will call RTC set for wall clock time 10.0 s at 10.5 s
+	 */
+	long set_offset_nsec;
+
 	bool registered;
 
 	struct nvmem_config *nvmem_config;
@@ -172,7 +180,7 @@ extern void devm_rtc_device_unregister(struct device *dev,
 
 extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm);
 extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm);
-extern int rtc_set_ntp_time(struct timespec64 now);
+extern int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec);
 int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm);
 extern int rtc_read_alarm(struct rtc_device *rtc,
 			struct rtc_wkalrm *alrm);
@@ -221,6 +229,39 @@ static inline bool is_leap_year(unsigned int year)
 	return (!(year % 4) && (year % 100)) || !(year % 400);
 }
 
+/* Determine if we can call to driver to set the time. Drivers can only be
+ * called to set a second aligned time value, and the field set_offset_nsec
+ * specifies how far away from the second aligned time to call the driver.
+ *
+ * This also computes 'to_set' which is the time we are trying to set, and has
+ * a zero in tv_nsecs, such that:
+ *    to_set - set_delay_nsec == now +/- FUZZ
+ *
+ */
+static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec,
+				  struct timespec64 *to_set,
+				  const struct timespec64 *now)
+{
+	/* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */
+	const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5;
+	struct timespec64 delay = {.tv_sec = 0,
+				   .tv_nsec = set_offset_nsec};
+
+	*to_set = timespec64_add(*now, delay);
+
+	if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) {
+		to_set->tv_nsec = 0;
+		return true;
+	}
+
+	if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) {
+		to_set->tv_sec++;
+		to_set->tv_nsec = 0;
+		return true;
+	}
+	return false;
+}
+
 #define rtc_register_device(device) \
 	__rtc_register_device(THIS_MODULE, device)
 
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index edf19cc53140..bc19de1a0683 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -492,6 +492,67 @@ out:
 	return leap;
 }
 
+static void sync_hw_clock(struct work_struct *work);
+static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock);
+
+static void sched_sync_hw_clock(struct timespec64 now,
+				unsigned long target_nsec, bool fail)
+
+{
+	struct timespec64 next;
+
+	getnstimeofday64(&next);
+	if (!fail)
+		next.tv_sec = 659;
+	else {
+		/*
+		 * Try again as soon as possible. Delaying long periods
+		 * decreases the accuracy of the work queue timer. Due to this
+		 * the algorithm is very likely to require a short-sleep retry
+		 * after the above long sleep to synchronize ts_nsec.
+		 */
+		next.tv_sec = 0;
+	}
+
+	/* Compute the needed delay that will get to tv_nsec == target_nsec */
+	next.tv_nsec = target_nsec - next.tv_nsec;
+	if (next.tv_nsec <= 0)
+		next.tv_nsec += NSEC_PER_SEC;
+	if (next.tv_nsec >= NSEC_PER_SEC) {
+		next.tv_sec++;
+		next.tv_nsec -= NSEC_PER_SEC;
+	}
+
+	queue_delayed_work(system_power_efficient_wq, &sync_work,
+			   timespec64_to_jiffies(&next));
+}
+
+static void sync_rtc_clock(void)
+{
+	unsigned long target_nsec;
+	struct timespec64 adjust, now;
+	int rc;
+
+	if (!IS_ENABLED(CONFIG_RTC_SYSTOHC))
+		return;
+
+	getnstimeofday64(&now);
+
+	adjust = now;
+	if (persistent_clock_is_local)
+		adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
+
+	/*
+	 * The current RTC in use will provide the target_nsec it wants to be
+	 * called at, and does rtc_tv_nsec_ok internally.
+	 */
+	rc = rtc_set_ntp_time(adjust, &target_nsec);
+	if (rc == -ENODEV)
+		return;
+
+	sched_sync_hw_clock(now, target_nsec, rc);
+}
+
 #ifdef CONFIG_GENERIC_CMOS_UPDATE
 int __weak update_persistent_clock(struct timespec now)
 {
@@ -507,76 +568,75 @@ int __weak update_persistent_clock64(struct timespec64 now64)
 }
 #endif
 
-#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
-static void sync_cmos_clock(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
-
-static void sync_cmos_clock(struct work_struct *work)
+static bool sync_cmos_clock(void)
 {
+	static bool no_cmos;
 	struct timespec64 now;
-	struct timespec64 next;
-	int fail = 1;
+	struct timespec64 adjust;
+	int rc = -EPROTO;
+	long target_nsec = NSEC_PER_SEC / 2;
+
+	if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE))
+		return false;
+
+	if (no_cmos)
+		return false;
 
 	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 * This code is run on a timer.  If the clock is set, that timer
-	 * may not expire at the correct time.  Thus, we adjust...
-	 * We want the clock to be within a couple of ticks from the target.
+	 * Historically update_persistent_clock64() has followed x86
+	 * semantics, which match the MC146818A/etc RTC. This RTC will store
+	 * 'adjust' and then in .5s it will advance once second.
+	 *
+	 * Architectures are strongly encouraged to use rtclib and not
+	 * implement this legacy API.
 	 */
-	if (!ntp_synced()) {
-		/*
-		 * Not synced, exit, do not restart a timer (if one is
-		 * running, let it run out).
-		 */
-		return;
-	}
-
 	getnstimeofday64(&now);
-	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
-		struct timespec64 adjust = now;
-
-		fail = -ENODEV;
+	if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) {
 		if (persistent_clock_is_local)
 			adjust.tv_sec -= (sys_tz.tz_minuteswest * 60);
-#ifdef CONFIG_GENERIC_CMOS_UPDATE
-		fail = update_persistent_clock64(adjust);
-#endif
-
-#ifdef CONFIG_RTC_SYSTOHC
-		if (fail == -ENODEV)
-			fail = rtc_set_ntp_time(adjust);
-#endif
+		rc = update_persistent_clock64(adjust);
+		/*
+		 * The machine does not support update_persistent_clock64 even
+		 * though it defines CONFIG_GENERIC_CMOS_UPDATE.
+		 */
+		if (rc == -ENODEV) {
+			no_cmos = true;
+			return false;
+		}
 	}
 
-	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
-	if (next.tv_nsec <= 0)
-		next.tv_nsec += NSEC_PER_SEC;
+	sched_sync_hw_clock(now, target_nsec, rc);
+	return true;
+}
 
-	if (!fail || fail == -ENODEV)
-		next.tv_sec = 659;
-	else
-		next.tv_sec = 0;
+/*
+ * If we have an externally synchronized Linux clock, then update RTC clock
+ * accordingly every ~11 minutes. Generally RTCs can only store second
+ * precision, but many RTCs will adjust the phase of their second tick to
+ * match the moment of update. This infrastructure arranges to call to the RTC
+ * set at the correct moment to phase synchronize the RTC second tick over
+ * with the kernel clock.
+ */
+static void sync_hw_clock(struct work_struct *work)
+{
+	if (!ntp_synced())
+		return;
 
-	if (next.tv_nsec >= NSEC_PER_SEC) {
-		next.tv_sec++;
-		next.tv_nsec -= NSEC_PER_SEC;
-	}
-	queue_delayed_work(system_power_efficient_wq,
-			   &sync_cmos_work, timespec64_to_jiffies(&next));
+	if (sync_cmos_clock())
+		return;
+
+	sync_rtc_clock();
 }
 
 void ntp_notify_cmos_timer(void)
 {
-	queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
-}
-
-#else
-void ntp_notify_cmos_timer(void) { }
-#endif
+	if (!ntp_synced())
+		return;
 
+	if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) ||
+	    IS_ENABLED(CONFIG_RTC_SYSTOHC))
+		queue_delayed_work(system_power_efficient_wq, &sync_work, 0);
+}
 
 /*
  * Propagate a new txc->status value into the NTP state:
-- 
cgit v1.3-14-g43fede


From e0956dcc4ba74ec4b17e32fc9a156fcba1ef6610 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 19 Oct 2017 13:14:44 +0200
Subject: timekeeping: Consolidate timekeeping_inject_offset code

The code to check the adjtimex() or clock_adjtime() arguments is spread
out across multiple files for presumably only historic reasons. As a
preparatation for a rework to get rid of the use of 'struct timeval'
and 'struct timespec' in there, this moves all the portions into
kernel/time/timekeeping.c and marks them as 'static'.

The warp_clock() function here is not as closely related as the others,
but I feel it still makes sense to move it here in order to consolidate
all callers of timekeeping_inject_offset().

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
[jstultz: Whitespace fixup]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/time.h       |  26 ----------
 kernel/time/ntp.c          |  61 ----------------------
 kernel/time/ntp_internal.h |   1 -
 kernel/time/time.c         |  36 +------------
 kernel/time/timekeeping.c  | 123 ++++++++++++++++++++++++++++++++++++++++++++-
 kernel/time/timekeeping.h  |   2 +-
 6 files changed, 123 insertions(+), 126 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time.h b/include/linux/time.h
index 9bc1f945777c..c0fbad08448f 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -134,32 +134,6 @@ static inline bool timeval_valid(const struct timeval *tv)
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 
-/*
- * Validates if a timespec/timeval used to inject a time offset is valid.
- * Offsets can be postive or negative. The value of the timeval/timespec
- * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must
- * always be non-negative.
- */
-static inline bool timeval_inject_offset_valid(const struct timeval *tv)
-{
-	/* We don't check the tv_sec as it can be positive or negative */
-
-	/* Can't have more microseconds then a second */
-	if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC)
-		return false;
-	return true;
-}
-
-static inline bool timespec_inject_offset_valid(const struct timespec *ts)
-{
-	/* We don't check the tv_sec as it can be positive or negative */
-
-	/* Can't have more nanoseconds then a second */
-	if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
-		return false;
-	return true;
-}
-
 /* Some architectures do not supply their own clocksource.
  * This is mainly the case in architectures that get their
  * inter-tick times by reading the counter on their interval
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index bc19de1a0683..90f84582a076 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -713,67 +713,6 @@ static inline void process_adjtimex_modes(struct timex *txc,
 }
 
 
-
-/**
- * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
- */
-int ntp_validate_timex(struct timex *txc)
-{
-	if (txc->modes & ADJ_ADJTIME) {
-		/* singleshot must not be used with any other mode bits */
-		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
-			return -EINVAL;
-		if (!(txc->modes & ADJ_OFFSET_READONLY) &&
-		    !capable(CAP_SYS_TIME))
-			return -EPERM;
-	} else {
-		/* In order to modify anything, you gotta be super-user! */
-		 if (txc->modes && !capable(CAP_SYS_TIME))
-			return -EPERM;
-		/*
-		 * if the quartz is off by more than 10% then
-		 * something is VERY wrong!
-		 */
-		if (txc->modes & ADJ_TICK &&
-		    (txc->tick <  900000/USER_HZ ||
-		     txc->tick > 1100000/USER_HZ))
-			return -EINVAL;
-	}
-
-	if (txc->modes & ADJ_SETOFFSET) {
-		/* In order to inject time, you gotta be super-user! */
-		if (!capable(CAP_SYS_TIME))
-			return -EPERM;
-
-		if (txc->modes & ADJ_NANO) {
-			struct timespec ts;
-
-			ts.tv_sec = txc->time.tv_sec;
-			ts.tv_nsec = txc->time.tv_usec;
-			if (!timespec_inject_offset_valid(&ts))
-				return -EINVAL;
-
-		} else {
-			if (!timeval_inject_offset_valid(&txc->time))
-				return -EINVAL;
-		}
-	}
-
-	/*
-	 * Check for potential multiplication overflows that can
-	 * only happen on 64-bit systems:
-	 */
-	if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
-		if (LLONG_MIN / PPM_SCALE > txc->freq)
-			return -EINVAL;
-		if (LLONG_MAX / PPM_SCALE < txc->freq)
-			return -EINVAL;
-	}
-
-	return 0;
-}
-
-
 /*
  * adjtimex mainly allows reading (and writing, if superuser) of
  * kernel time-keeping variables. used by xntpd.
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index d8a7c11fa71a..74b52cd48209 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -7,7 +7,6 @@ extern void ntp_clear(void);
 extern u64 ntp_tick_length(void);
 extern ktime_t ntp_get_next_leap(void);
 extern int second_overflow(time64_t secs);
-extern int ntp_validate_timex(struct timex *);
 extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
 extern void __hardpps(const struct timespec64 *, const struct timespec64 *);
 #endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 44a8c1402133..04684e294f00 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -157,40 +157,6 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
 	return 0;
 }
 
-/*
- * Indicates if there is an offset between the system clock and the hardware
- * clock/persistent clock/rtc.
- */
-int persistent_clock_is_local;
-
-/*
- * Adjust the time obtained from the CMOS to be UTC time instead of
- * local time.
- *
- * This is ugly, but preferable to the alternatives.  Otherwise we
- * would either need to write a program to do it in /etc/rc (and risk
- * confusion if the program gets run more than once; it would also be
- * hard to make the program warp the clock precisely n hours)  or
- * compile in the timezone information into the kernel.  Bad, bad....
- *
- *						- TYT, 1992-01-01
- *
- * The best thing to do is to keep the CMOS clock in universal time (UTC)
- * as real UNIX machines always do it. This avoids all headaches about
- * daylight saving times and warping kernel clocks.
- */
-static inline void warp_clock(void)
-{
-	if (sys_tz.tz_minuteswest != 0) {
-		struct timespec adjust;
-
-		persistent_clock_is_local = 1;
-		adjust.tv_sec = sys_tz.tz_minuteswest * 60;
-		adjust.tv_nsec = 0;
-		timekeeping_inject_offset(&adjust);
-	}
-}
-
 /*
  * In case for some reason the CMOS clock has not already been running
  * in UTC, but in some local time: The first time we set the timezone,
@@ -224,7 +190,7 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz
 		if (firsttime) {
 			firsttime = 0;
 			if (!tv)
-				warp_clock();
+				timekeeping_warp_clock();
 		}
 	}
 	if (tv)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2cafb49aa65e..7d8e0e842484 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1258,13 +1258,39 @@ out:
 }
 EXPORT_SYMBOL(do_settimeofday64);
 
+/*
+ * Validates if a timespec/timeval used to inject a time offset is valid.
+ * Offsets can be postive or negative. The value of the timeval/timespec
+ * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must
+ * always be non-negative.
+ */
+static inline bool timeval_inject_offset_valid(const struct timeval *tv)
+{
+	/* We don't check the tv_sec as it can be positive or negative */
+
+	/* Can't have more microseconds then a second */
+	if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC)
+		return false;
+	return true;
+}
+
+static inline bool timespec_inject_offset_valid(const struct timespec *ts)
+{
+	/* We don't check the tv_sec as it can be positive or negative */
+
+	/* Can't have more nanoseconds then a second */
+	if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
+		return false;
+	return true;
+}
+
 /**
  * timekeeping_inject_offset - Adds or subtracts from the current time.
  * @tv:		pointer to the timespec variable containing the offset
  *
  * Adds or subtracts an offset value from the current time.
  */
-int timekeeping_inject_offset(struct timespec *ts)
+static int timekeeping_inject_offset(struct timespec *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
@@ -1303,7 +1329,40 @@ error: /* even if we error out, we forwarded the time, so call update */
 
 	return ret;
 }
-EXPORT_SYMBOL(timekeeping_inject_offset);
+
+/*
+ * Indicates if there is an offset between the system clock and the hardware
+ * clock/persistent clock/rtc.
+ */
+int persistent_clock_is_local;
+
+/*
+ * Adjust the time obtained from the CMOS to be UTC time instead of
+ * local time.
+ *
+ * This is ugly, but preferable to the alternatives.  Otherwise we
+ * would either need to write a program to do it in /etc/rc (and risk
+ * confusion if the program gets run more than once; it would also be
+ * hard to make the program warp the clock precisely n hours)  or
+ * compile in the timezone information into the kernel.  Bad, bad....
+ *
+ *						- TYT, 1992-01-01
+ *
+ * The best thing to do is to keep the CMOS clock in universal time (UTC)
+ * as real UNIX machines always do it. This avoids all headaches about
+ * daylight saving times and warping kernel clocks.
+ */
+void timekeeping_warp_clock(void)
+{
+	if (sys_tz.tz_minuteswest != 0) {
+		struct timespec adjust;
+
+		persistent_clock_is_local = 1;
+		adjust.tv_sec = sys_tz.tz_minuteswest * 60;
+		adjust.tv_nsec = 0;
+		timekeeping_inject_offset(&adjust);
+	}
+}
 
 /**
  * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
@@ -2247,6 +2306,66 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
 	return base;
 }
 
+/**
+ * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
+ */
+static int ntp_validate_timex(struct timex *txc)
+{
+	if (txc->modes & ADJ_ADJTIME) {
+		/* singleshot must not be used with any other mode bits */
+		if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
+			return -EINVAL;
+		if (!(txc->modes & ADJ_OFFSET_READONLY) &&
+		    !capable(CAP_SYS_TIME))
+			return -EPERM;
+	} else {
+		/* In order to modify anything, you gotta be super-user! */
+		if (txc->modes && !capable(CAP_SYS_TIME))
+			return -EPERM;
+		/*
+		 * if the quartz is off by more than 10% then
+		 * something is VERY wrong!
+		 */
+		if (txc->modes & ADJ_TICK &&
+		    (txc->tick <  900000/USER_HZ ||
+		     txc->tick > 1100000/USER_HZ))
+			return -EINVAL;
+	}
+
+	if (txc->modes & ADJ_SETOFFSET) {
+		/* In order to inject time, you gotta be super-user! */
+		if (!capable(CAP_SYS_TIME))
+			return -EPERM;
+
+		if (txc->modes & ADJ_NANO) {
+			struct timespec ts;
+
+			ts.tv_sec = txc->time.tv_sec;
+			ts.tv_nsec = txc->time.tv_usec;
+			if (!timespec_inject_offset_valid(&ts))
+				return -EINVAL;
+
+		} else {
+			if (!timeval_inject_offset_valid(&txc->time))
+				return -EINVAL;
+		}
+	}
+
+	/*
+	 * Check for potential multiplication overflows that can
+	 * only happen on 64-bit systems:
+	 */
+	if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
+		if (LLONG_MIN / PPM_SCALE > txc->freq)
+			return -EINVAL;
+		if (LLONG_MAX / PPM_SCALE < txc->freq)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+
 /**
  * do_adjtimex() - Accessor function to NTP __do_adjtimex function
  */
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index d0914676d4c5..44aec7893cdd 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -10,7 +10,7 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
 
 extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
-extern int timekeeping_inject_offset(struct timespec *ts);
+extern void timekeeping_warp_clock(void);
 extern int timekeeping_suspend(void);
 extern void timekeeping_resume(void);
 
-- 
cgit v1.3-14-g43fede


From 1572fa03784831b81ec26ec379374cf6bdec04fb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 19 Oct 2017 13:14:45 +0200
Subject: timekeeping: Use timespec64 in timekeeping_inject_offset

As part of changing all the timekeeping code to use 64-bit
time_t consistently, this removes the uses of timeval
and timespec as much as possible from do_adjtimex() and
timekeeping_inject_offset(). The timeval_inject_offset_valid()
and timespec_inject_offset_valid() just complicate this,
so I'm folding them into the respective callers.

This leaves the actual 'struct timex' definition, which
is part of the user-space ABI and should be dealt with
separately when we have agreed on the ABI change.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/timekeeping.c | 72 ++++++++++++++++-------------------------------
 1 file changed, 25 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7d8e0e842484..c6a35fb3cf76 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1258,65 +1258,37 @@ out:
 }
 EXPORT_SYMBOL(do_settimeofday64);
 
-/*
- * Validates if a timespec/timeval used to inject a time offset is valid.
- * Offsets can be postive or negative. The value of the timeval/timespec
- * is the sum of its fields, but *NOTE*: the field tv_usec/tv_nsec must
- * always be non-negative.
- */
-static inline bool timeval_inject_offset_valid(const struct timeval *tv)
-{
-	/* We don't check the tv_sec as it can be positive or negative */
-
-	/* Can't have more microseconds then a second */
-	if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC)
-		return false;
-	return true;
-}
-
-static inline bool timespec_inject_offset_valid(const struct timespec *ts)
-{
-	/* We don't check the tv_sec as it can be positive or negative */
-
-	/* Can't have more nanoseconds then a second */
-	if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
-		return false;
-	return true;
-}
-
 /**
  * timekeeping_inject_offset - Adds or subtracts from the current time.
  * @tv:		pointer to the timespec variable containing the offset
  *
  * Adds or subtracts an offset value from the current time.
  */
-static int timekeeping_inject_offset(struct timespec *ts)
+static int timekeeping_inject_offset(struct timespec64 *ts)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned long flags;
-	struct timespec64 ts64, tmp;
+	struct timespec64 tmp;
 	int ret = 0;
 
-	if (!timespec_inject_offset_valid(ts))
+	if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
 		return -EINVAL;
 
-	ts64 = timespec_to_timespec64(*ts);
-
 	raw_spin_lock_irqsave(&timekeeper_lock, flags);
 	write_seqcount_begin(&tk_core.seq);
 
 	timekeeping_forward_now(tk);
 
 	/* Make sure the proposed value is valid */
-	tmp = timespec64_add(tk_xtime(tk),  ts64);
-	if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
+	tmp = timespec64_add(tk_xtime(tk), *ts);
+	if (timespec64_compare(&tk->wall_to_monotonic, ts) > 0 ||
 	    !timespec64_valid_strict(&tmp)) {
 		ret = -EINVAL;
 		goto error;
 	}
 
-	tk_xtime_add(tk, &ts64);
-	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts64));
+	tk_xtime_add(tk, ts);
+	tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *ts));
 
 error: /* even if we error out, we forwarded the time, so call update */
 	timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1355,7 +1327,7 @@ int persistent_clock_is_local;
 void timekeeping_warp_clock(void)
 {
 	if (sys_tz.tz_minuteswest != 0) {
-		struct timespec adjust;
+		struct timespec64 adjust;
 
 		persistent_clock_is_local = 1;
 		adjust.tv_sec = sys_tz.tz_minuteswest * 60;
@@ -2307,9 +2279,9 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
 }
 
 /**
- * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
+ * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
  */
-static int ntp_validate_timex(struct timex *txc)
+static int timekeeping_validate_timex(struct timex *txc)
 {
 	if (txc->modes & ADJ_ADJTIME) {
 		/* singleshot must not be used with any other mode bits */
@@ -2337,16 +2309,22 @@ static int ntp_validate_timex(struct timex *txc)
 		if (!capable(CAP_SYS_TIME))
 			return -EPERM;
 
-		if (txc->modes & ADJ_NANO) {
-			struct timespec ts;
+		/*
+		 * Validate if a timespec/timeval used to inject a time
+		 * offset is valid.  Offsets can be postive or negative, so
+		 * we don't check tv_sec. The value of the timeval/timespec
+		 * is the sum of its fields,but *NOTE*:
+		 * The field tv_usec/tv_nsec must always be non-negative and
+		 * we can't have more nanoseconds/microseconds than a second.
+		 */
+		if (txc->time.tv_usec < 0)
+			return -EINVAL;
 
-			ts.tv_sec = txc->time.tv_sec;
-			ts.tv_nsec = txc->time.tv_usec;
-			if (!timespec_inject_offset_valid(&ts))
+		if (txc->modes & ADJ_NANO) {
+			if (txc->time.tv_usec >= NSEC_PER_SEC)
 				return -EINVAL;
-
 		} else {
-			if (!timeval_inject_offset_valid(&txc->time))
+			if (txc->time.tv_usec >= USEC_PER_SEC)
 				return -EINVAL;
 		}
 	}
@@ -2378,12 +2356,12 @@ int do_adjtimex(struct timex *txc)
 	int ret;
 
 	/* Validate the data before disabling interrupts */
-	ret = ntp_validate_timex(txc);
+	ret = timekeeping_validate_timex(txc);
 	if (ret)
 		return ret;
 
 	if (txc->modes & ADJ_SETOFFSET) {
-		struct timespec delta;
+		struct timespec64 delta;
 		delta.tv_sec  = txc->time.tv_sec;
 		delta.tv_nsec = txc->time.tv_usec;
 		if (!(txc->modes & ADJ_NANO))
-- 
cgit v1.3-14-g43fede


From 85bf19e7df2479140eff2348a4e6a9c19b5c3960 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 19 Oct 2017 13:14:46 +0200
Subject: time: Remove unused functions

The (slow but) ongoing work on conversion from timespec to timespec64
has led some timespec based helper functions to become unused.

No new code should use them, so we can remove the functions entirely.
I'm planning to obsolete additional interfaces next and remove
more of these.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/time.h   | 18 ------------------
 include/linux/time64.h | 28 ----------------------------
 kernel/time/time.c     | 18 ------------------
 3 files changed, 64 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time.h b/include/linux/time.h
index c0fbad08448f..0e8a80918484 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -39,15 +39,6 @@ static inline int timespec_compare(const struct timespec *lhs, const struct time
 	return lhs->tv_nsec - rhs->tv_nsec;
 }
 
-static inline int timeval_compare(const struct timeval *lhs, const struct timeval *rhs)
-{
-	if (lhs->tv_sec < rhs->tv_sec)
-		return -1;
-	if (lhs->tv_sec > rhs->tv_sec)
-		return 1;
-	return lhs->tv_usec - rhs->tv_usec;
-}
-
 extern time64_t mktime64(const unsigned int year, const unsigned int mon,
 			const unsigned int day, const unsigned int hour,
 			const unsigned int min, const unsigned int sec);
@@ -65,15 +56,6 @@ static inline unsigned long mktime(const unsigned int year,
 
 extern void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec);
 
-/*
- * timespec_add_safe assumes both values are positive and checks
- * for overflow. It will return TIME_T_MAX if the reutrn would be
- * smaller then either of the arguments.
- */
-extern struct timespec timespec_add_safe(const struct timespec lhs,
-					 const struct timespec rhs);
-
-
 static inline struct timespec timespec_add(struct timespec lhs,
 						struct timespec rhs)
 {
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 980c71b3001a..402b595c76d2 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -53,16 +53,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
 	return ts;
 }
 
-static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64)
-{
-	return *its64;
-}
-
-static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its)
-{
-	return *its;
-}
-
 # define timespec64_equal		timespec_equal
 # define timespec64_compare		timespec_compare
 # define set_normalized_timespec64	set_normalized_timespec
@@ -94,24 +84,6 @@ static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
 	return ret;
 }
 
-static inline struct itimerspec itimerspec64_to_itimerspec(struct itimerspec64 *its64)
-{
-	struct itimerspec ret;
-
-	ret.it_interval = timespec64_to_timespec(its64->it_interval);
-	ret.it_value = timespec64_to_timespec(its64->it_value);
-	return ret;
-}
-
-static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *its)
-{
-	struct itimerspec64 ret;
-
-	ret.it_interval = timespec_to_timespec64(its->it_interval);
-	ret.it_value = timespec_to_timespec64(its->it_value);
-	return ret;
-}
-
 static inline int timespec64_equal(const struct timespec64 *a,
 				   const struct timespec64 *b)
 {
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 04684e294f00..947fb614c78f 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -818,24 +818,6 @@ unsigned long nsecs_to_jiffies(u64 n)
 }
 EXPORT_SYMBOL_GPL(nsecs_to_jiffies);
 
-/*
- * Add two timespec values and do a safety check for overflow.
- * It's assumed that both values are valid (>= 0)
- */
-struct timespec timespec_add_safe(const struct timespec lhs,
-				  const struct timespec rhs)
-{
-	struct timespec res;
-
-	set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
-				lhs.tv_nsec + rhs.tv_nsec);
-
-	if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
-		res.tv_sec = TIME_T_MAX;
-
-	return res;
-}
-
 /*
  * Add two timespec64 values and do a safety check for overflow.
  * It's assumed that both values are valid (>= 0).
-- 
cgit v1.3-14-g43fede


From abc8f96e3eb846fcf6333395ee1f6ed4a734576c Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 19 Oct 2017 13:14:48 +0200
Subject: time: Move time_t conversion helpers to time32.h

On 64-bit architectures, the timespec64 based helpers in linux/time.h
are defined as macros pointing to their timespec based counterparts.
This made sense when they were first introduced, but as we are migrating
away from timespec in general, it's much less intuitive now.

This changes the macros to work in the exact opposite way: we always
provide the timespec64 based helpers and define the old interfaces as
macros for them. Now we can move those macros into linux/time32.h, which
already contains the respective helpers for 32-bit architectures.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/time32.h | 45 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/time64.h | 50 +-------------------------------------------------
 kernel/time/time.c     |  5 +++--
 3 files changed, 49 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time32.h b/include/linux/time32.h
index 9b9c43f0d39b..65b1de25198d 100644
--- a/include/linux/time32.h
+++ b/include/linux/time32.h
@@ -13,6 +13,49 @@
 
 #define TIME_T_MAX	(time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)
 
+#if __BITS_PER_LONG == 64
+
+/* timespec64 is defined as timespec here */
+static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
+{
+	return ts64;
+}
+
+static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
+{
+	return ts;
+}
+
+# define timespec_equal			timespec64_equal
+# define timespec_compare		timespec64_compare
+# define set_normalized_timespec	set_normalized_timespec64
+# define timespec_add			timespec64_add
+# define timespec_sub			timespec64_sub
+# define timespec_valid			timespec64_valid
+# define timespec_valid_strict		timespec64_valid_strict
+# define timespec_to_ns			timespec64_to_ns
+# define ns_to_timespec			ns_to_timespec64
+# define timespec_add_ns		timespec64_add_ns
+
+#else
+static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
+{
+	struct timespec ret;
+
+	ret.tv_sec = (time_t)ts64.tv_sec;
+	ret.tv_nsec = ts64.tv_nsec;
+	return ret;
+}
+
+static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
+{
+	struct timespec64 ret;
+
+	ret.tv_sec = ts.tv_sec;
+	ret.tv_nsec = ts.tv_nsec;
+	return ret;
+}
+
 static inline int timespec_equal(const struct timespec *a,
 				 const struct timespec *b)
 {
@@ -116,6 +159,8 @@ static __always_inline void timespec_add_ns(struct timespec *a, u64 ns)
 	a->tv_nsec = ns;
 }
 
+#endif
+
 /**
  * time_to_tm - converts the calendar time to local broken-down time
  *
diff --git a/include/linux/time64.h b/include/linux/time64.h
index 402b595c76d2..ec1888cf5378 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -7,11 +7,8 @@
 typedef __s64 time64_t;
 typedef __u64 timeu64_t;
 
-/*
- * This wants to go into uapi/linux/time.h once we agreed about the
- * userspace interfaces.
- */
 #if __BITS_PER_LONG == 64
+/* this trick allows us to optimize out timespec64_to_timespec */
 # define timespec64 timespec
 #define itimerspec64 itimerspec
 #else
@@ -41,49 +38,6 @@ struct itimerspec64 {
 #define KTIME_MAX			((s64)~((u64)1 << 63))
 #define KTIME_SEC_MAX			(KTIME_MAX / NSEC_PER_SEC)
 
-#if __BITS_PER_LONG == 64
-
-static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
-{
-	return ts64;
-}
-
-static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
-{
-	return ts;
-}
-
-# define timespec64_equal		timespec_equal
-# define timespec64_compare		timespec_compare
-# define set_normalized_timespec64	set_normalized_timespec
-# define timespec64_add			timespec_add
-# define timespec64_sub			timespec_sub
-# define timespec64_valid		timespec_valid
-# define timespec64_valid_strict	timespec_valid_strict
-# define timespec64_to_ns		timespec_to_ns
-# define ns_to_timespec64		ns_to_timespec
-# define timespec64_add_ns		timespec_add_ns
-
-#else
-
-static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
-{
-	struct timespec ret;
-
-	ret.tv_sec = (time_t)ts64.tv_sec;
-	ret.tv_nsec = ts64.tv_nsec;
-	return ret;
-}
-
-static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
-{
-	struct timespec64 ret;
-
-	ret.tv_sec = ts.tv_sec;
-	ret.tv_nsec = ts.tv_nsec;
-	return ret;
-}
-
 static inline int timespec64_equal(const struct timespec64 *a,
 				   const struct timespec64 *b)
 {
@@ -185,8 +139,6 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
 	a->tv_nsec = ns;
 }
 
-#endif
-
 /*
  * timespec64_add_safe assumes both values are positive and checks for
  * overflow. It will return TIME64_MAX in case of overflow.
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 947fb614c78f..fe60ebd301cf 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -407,6 +407,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
 }
 EXPORT_SYMBOL(mktime64);
 
+#if __BITS_PER_LONG == 32
 /**
  * set_normalized_timespec - set timespec sec and nsec parts and normalize
  *
@@ -467,6 +468,7 @@ struct timespec ns_to_timespec(const s64 nsec)
 	return ts;
 }
 EXPORT_SYMBOL(ns_to_timespec);
+#endif
 
 /**
  * ns_to_timeval - Convert nanoseconds to timeval
@@ -486,7 +488,6 @@ struct timeval ns_to_timeval(const s64 nsec)
 }
 EXPORT_SYMBOL(ns_to_timeval);
 
-#if BITS_PER_LONG == 32
 /**
  * set_normalized_timespec - set timespec sec and nsec parts and normalize
  *
@@ -547,7 +548,7 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
 	return ts;
 }
 EXPORT_SYMBOL(ns_to_timespec64);
-#endif
+
 /**
  * msecs_to_jiffies: - convert milliseconds to jiffies
  * @m:	time in milliseconds
-- 
cgit v1.3-14-g43fede


From 16c0890dc66d258fdeccf7b15a133f3930b19143 Mon Sep 17 00:00:00 2001
From: Byungchul Park <byungchul.park@lge.com>
Date: Tue, 31 Oct 2017 02:46:54 +0100
Subject: irq/work: Don't reinvent the wheel but use existing llist API

Use the proper llist APIs instead of open-coded variants of them.

Signed-off-by: Byungchul Park <byungchul.park@lge.com>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1509414414-14987-1-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/irq_work.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index bcf107ce0854..e2ebe8c71e8f 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -138,11 +138,7 @@ static void irq_work_run_list(struct llist_head *list)
 		return;
 
 	llnode = llist_del_all(list);
-	while (llnode != NULL) {
-		work = llist_entry(llnode, struct irq_work, llnode);
-
-		llnode = llist_next(llnode);
-
+	llist_for_each_entry(work, llnode, llnode) {
 		/*
 		 * Clear the PENDING bit, after this point the @work
 		 * can be re-used.
-- 
cgit v1.3-14-g43fede


From ab97f87325e28b7ef7717e6cb62e8da14a7176e1 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Fri, 20 Oct 2017 13:26:02 +0300
Subject: fsnotify: convert fsnotify_mark.refcnt from atomic_t to refcount_t

atomic_t variables are currently used to implement reference
counters with the following properties:
 - counter is initialized to 1 using atomic_set()
 - a resource is freed upon counter reaching zero
 - once counter reaches zero, its further
   increments aren't allowed
 - counter schema uses basic atomic operations
   (set, inc, inc_not_zero, dec_and_test, etc.)

Such atomic variables should be converted to a newly provided
refcount_t type and API that prevents accidental counter overflows
and underflows. This is important since overflows and underflows
can lead to use-after-free situation and be exploitable.

The variable fsnotify_mark.refcnt is used as pure reference counter.
Convert it to refcount_t and fix up the operations.

Suggested-by: Kees Cook <keescook@chromium.org>
Reviewed-by: David Windsor <dwindsor@gmail.com>
Reviewed-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/notify/inotify/inotify_user.c |  4 ++--
 fs/notify/mark.c                 | 14 +++++++-------
 include/linux/fsnotify_backend.h |  2 +-
 kernel/audit_tree.c              |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 7cc7d3fb1862..d3c20e0bb046 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -376,7 +376,7 @@ static struct inotify_inode_mark *inotify_idr_find_locked(struct fsnotify_group
 
 		fsnotify_get_mark(fsn_mark);
 		/* One ref for being in the idr, one ref we just took */
-		BUG_ON(atomic_read(&fsn_mark->refcnt) < 2);
+		BUG_ON(refcount_read(&fsn_mark->refcnt) < 2);
 	}
 
 	return i_mark;
@@ -446,7 +446,7 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
 	 * One ref for being in the idr
 	 * one ref grabbed by inotify_idr_find
 	 */
-	if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) {
+	if (unlikely(refcount_read(&i_mark->fsn_mark.refcnt) < 2)) {
 		printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
 			 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
 		/* we can't really recover with bad ref cnting.. */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index f3a32ea15b49..e9191b416434 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -105,8 +105,8 @@ static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);
 
 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
-	WARN_ON_ONCE(!atomic_read(&mark->refcnt));
-	atomic_inc(&mark->refcnt);
+	WARN_ON_ONCE(!refcount_read(&mark->refcnt));
+	refcount_inc(&mark->refcnt);
 }
 
 static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
@@ -201,7 +201,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 
 	/* Catch marks that were actually never attached to object */
 	if (!mark->connector) {
-		if (atomic_dec_and_test(&mark->refcnt))
+		if (refcount_dec_and_test(&mark->refcnt))
 			fsnotify_final_mark_destroy(mark);
 		return;
 	}
@@ -210,7 +210,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 	 * We have to be careful so that traversals of obj_list under lock can
 	 * safely grab mark reference.
 	 */
-	if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock))
+	if (!refcount_dec_and_lock(&mark->refcnt, &mark->connector->lock))
 		return;
 
 	conn = mark->connector;
@@ -258,7 +258,7 @@ static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
 	if (!mark)
 		return true;
 
-	if (atomic_inc_not_zero(&mark->refcnt)) {
+	if (refcount_inc_not_zero(&mark->refcnt)) {
 		spin_lock(&mark->lock);
 		if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) {
 			/* mark is attached, group is still alive then */
@@ -335,7 +335,7 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
 
 	WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
 	WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
-		     atomic_read(&mark->refcnt) < 1 +
+		     refcount_read(&mark->refcnt) < 1 +
 			!!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
 
 	spin_lock(&mark->lock);
@@ -737,7 +737,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 {
 	memset(mark, 0, sizeof(*mark));
 	spin_lock_init(&mark->lock);
-	atomic_set(&mark->refcnt, 1);
+	refcount_set(&mark->refcnt, 1);
 	fsnotify_get_group(group);
 	mark->group = group;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 744e2b9969fc..9bcb43953f4e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -242,7 +242,7 @@ struct fsnotify_mark {
 	__u32 mask;
 	/* We hold one for presence in g_list. Also one ref for each 'thing'
 	 * in kernel that found and may be using this mark. */
-	atomic_t refcnt;
+	refcount_t refcnt;
 	/* Group this mark is for. Set on mark creation, stable until last ref
 	 * is dropped */
 	struct fsnotify_group *group;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 011d46e5f73f..45ec960ad536 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1007,7 +1007,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 	 * We are guaranteed to have at least one reference to the mark from
 	 * either the inode or the caller of fsnotify_destroy_mark().
 	 */
-	BUG_ON(atomic_read(&entry->refcnt) < 1);
+	BUG_ON(refcount_read(&entry->refcnt) < 1);
 }
 
 static const struct fsnotify_ops audit_tree_ops = {
-- 
cgit v1.3-14-g43fede


From aa4bf44dc851c6bdd4f7b61b5f2c56c84dfe2ff0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Wed, 25 Oct 2017 00:04:40 +0200
Subject: userns: use union in {g,u}idmap struct

- Add a struct containing two pointer to extents and wrap both the static extent
  array and the struct into a union. This is done in preparation for bumping the
  {g,u}idmap limits for user namespaces.
- Add brackets around anonymous union when using designated initializers to
  initialize members in order to please gcc <= 4.4.

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h | 18 +++++++++++++-----
 kernel/user.c                  | 30 ++++++++++++++++++------------
 2 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c18e01252346..7c83d7f6289b 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -12,13 +12,21 @@
 
 #define UID_GID_MAP_MAX_EXTENTS 5
 
+struct uid_gid_extent {
+	u32 first;
+	u32 lower_first;
+	u32 count;
+};
+
 struct uid_gid_map {	/* 64 bytes -- 1 cache line */
 	u32 nr_extents;
-	struct uid_gid_extent {
-		u32 first;
-		u32 lower_first;
-		u32 count;
-	} extent[UID_GID_MAP_MAX_EXTENTS];
+	union {
+		struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS];
+		struct {
+			struct uid_gid_extent *forward;
+			struct uid_gid_extent *reverse;
+		};
+	};
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
diff --git a/kernel/user.c b/kernel/user.c
index 00281add65b2..9a20acce460d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,26 +26,32 @@
 struct user_namespace init_user_ns = {
 	.uid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.gid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.projid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.count = ATOMIC_INIT(3),
-- 
cgit v1.3-14-g43fede


From 6397fac4915ab3002dc15aae751455da1a852f25 Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Wed, 25 Oct 2017 00:04:41 +0200
Subject: userns: bump idmap limits to 340

There are quite some use cases where users run into the current limit for
{g,u}id mappings. Consider a user requesting us to map everything but 999, and
1001 for a given range of 1000000000 with a sub{g,u}id layout of:

some-user:100000:1000000000
some-user:999:1
some-user:1000:1
some-user:1001:1
some-user:1002:1

This translates to:

MAPPING-TYPE | CONTAINER |    HOST |     RANGE |
-------------|-----------|---------|-----------|
         uid |       999 |     999 |         1 |
         uid |      1001 |    1001 |         1 |
         uid |         0 | 1000000 |       999 |
         uid |      1000 | 1001000 |         1 |
         uid |      1002 | 1001002 | 999998998 |
------------------------------------------------
         gid |       999 |     999 |         1 |
         gid |      1001 |    1001 |         1 |
         gid |         0 | 1000000 |       999 |
         gid |      1000 | 1001000 |         1 |
         gid |      1002 | 1001002 | 999998998 |

which is already the current limit.

As discussed at LPC simply bumping the number of limits is not going to work
since this would mean that struct uid_gid_map won't fit into a single cache-line
anymore thereby regressing performance for the base-cases. The same problem
seems to arise when using a single pointer. So the idea is to use

struct uid_gid_extent {
	u32 first;
	u32 lower_first;
	u32 count;
};

struct uid_gid_map { /* 64 bytes -- 1 cache line */
	u32 nr_extents;
	union {
		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
		struct {
			struct uid_gid_extent *forward;
			struct uid_gid_extent *reverse;
		};
	};
};

For the base cases we will only use the struct uid_gid_extent extent member. If
we go over UID_GID_MAP_MAX_BASE_EXTENTS mappings we perform a single 4k
kmalloc() which means we can have a maximum of 340 mappings
(340 * size(struct uid_gid_extent) = 4080). For the latter case we use two
pointers "forward" and "reverse". The forward pointer points to an array sorted
by "first" and the reverse pointer points to an array sorted by "lower_first".
We can then perform binary search on those arrays.

Performance Testing:
When Eric introduced the extent-based struct uid_gid_map approach he measured
the performanc impact of his idmap changes:

> My benchmark consisted of going to single user mode where nothing else was
> running. On an ext4 filesystem opening 1,000,000 files and looping through all
> of the files 1000 times and calling fstat on the individuals files. This was
> to ensure I was benchmarking stat times where the inodes were in the kernels
> cache, but the inode values were not in the processors cache. My results:

> v3.4-rc1:         ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled)
> v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled)
> v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled)

I used an identical approach on my laptop. Here's a thorough description of what
I did. I built a 4.14.0-rc4 mainline kernel with my new idmap patches applied. I
booted into single user mode and used an ext4 filesystem to open/create
1,000,000 files. Then I looped through all of the files calling fstat() on each
of them 1000 times and calculated the mean fstat() time for a single file. (The
test program can be found below.)

Here are the results. For fun, I compared the first version of my patch which
scaled linearly with the new version of the patch:

|   # MAPPINGS |   PATCH-V1 | PATCH-NEW |
|--------------|------------|-----------|
|   0 mappings |     158 ns |   158 ns  |
|   1 mappings |     164 ns |   157 ns  |
|   2 mappings |     170 ns |   158 ns  |
|   3 mappings |     175 ns |   161 ns  |
|   5 mappings |     187 ns |   165 ns  |
|  10 mappings |     218 ns |   199 ns  |
|  50 mappings |     528 ns |   218 ns  |
| 100 mappings |     980 ns |   229 ns  |
| 200 mappings |    1880 ns |   239 ns  |
| 300 mappings |    2760 ns |   240 ns  |
| 340 mappings | not tested |   248 ns  |

Here's the test program I used. I asked Eric what he did and this is a more
"advanced" implementation of the idea. It's pretty straight-forward:

 #define __GNU_SOURCE
 #define __STDC_FORMAT_MACROS
 #include <errno.h>
 #include <dirent.h>
 #include <fcntl.h>
 #include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/types.h>

 int main(int argc, char *argv[])
 {
 	int ret;
 	size_t i, k;
 	int fd[1000000];
 	int times[1000];
 	char pathname[4096];
 	struct stat st;
 	struct timeval t1, t2;
 	uint64_t time_in_mcs;
 	uint64_t sum = 0;

 	if (argc != 2) {
 		fprintf(stderr, "Please specify a directory where to create "
 				"the test files\n");
 		exit(EXIT_FAILURE);
 	}

 	for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) {
 		sprintf(pathname, "%s/idmap_test_%zu", argv[1], i);
 		fd[i]= open(pathname, O_RDWR | O_CREAT, S_IXUSR | S_IXGRP | S_IXOTH);
 		if (fd[i] < 0) {
 			ssize_t j;
 			for (j = i; j >= 0; j--)
 				close(fd[j]);
 			exit(EXIT_FAILURE);
 		}
 	}

 	for (k = 0; k < 1000; k++) {
 		ret = gettimeofday(&t1, NULL);
 		if (ret < 0)
 			goto close_all;

 		for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++) {
 			ret = fstat(fd[i], &st);
 			if (ret < 0)
 				goto close_all;
 		}

 		ret = gettimeofday(&t2, NULL);
 		if (ret < 0)
 			goto close_all;

 		time_in_mcs = (1000000 * t2.tv_sec + t2.tv_usec) -
 			      (1000000 * t1.tv_sec + t1.tv_usec);
 		printf("Total time in micro seconds:       %" PRIu64 "\n",
 		       time_in_mcs);
 		printf("Total time in nanoseconds:         %" PRIu64 "\n",
 		       time_in_mcs * 1000);
 		printf("Time per file in nanoseconds:      %" PRIu64 "\n",
 		       (time_in_mcs * 1000) / 1000000);
 		times[k] = (time_in_mcs * 1000) / 1000000;
 	}

 close_all:
 	for (i = 0; i < sizeof(fd) / sizeof(fd[0]); i++)
 		close(fd[i]);

 	if (ret < 0)
 		exit(EXIT_FAILURE);

 	for (k = 0; k < 1000; k++) {
 		sum += times[k];
 	}

 	printf("Mean time per file in nanoseconds: %" PRIu64 "\n", sum / 1000);

 	exit(EXIT_SUCCESS);;
 }

Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
CC: Serge Hallyn <serge@hallyn.com>
CC: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/user_namespace.h |   7 +-
 kernel/user_namespace.c        | 350 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 324 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 7c83d7f6289b..1c1046a60fb4 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -10,7 +10,8 @@
 #include <linux/sysctl.h>
 #include <linux/err.h>
 
-#define UID_GID_MAP_MAX_EXTENTS 5
+#define UID_GID_MAP_MAX_BASE_EXTENTS 5
+#define UID_GID_MAP_MAX_EXTENTS 340
 
 struct uid_gid_extent {
 	u32 first;
@@ -18,10 +19,10 @@ struct uid_gid_extent {
 	u32 count;
 };
 
-struct uid_gid_map {	/* 64 bytes -- 1 cache line */
+struct uid_gid_map { /* 64 bytes -- 1 cache line */
 	u32 nr_extents;
 	union {
-		struct uid_gid_extent extent[UID_GID_MAP_MAX_EXTENTS];
+		struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
 		struct {
 			struct uid_gid_extent *forward;
 			struct uid_gid_extent *reverse;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..5fd2d53dbc75 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -23,6 +23,8 @@
 #include <linux/ctype.h>
 #include <linux/projid.h>
 #include <linux/fs_struct.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
 static DEFINE_MUTEX(userns_state_mutex);
@@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work)
 	do {
 		struct ucounts *ucounts = ns->ucounts;
 		parent = ns->parent;
+		if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->gid_map.forward);
+			kfree(ns->gid_map.reverse);
+		}
+		if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->uid_map.forward);
+			kfree(ns->uid_map.reverse);
+		}
+		if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->projid_map.forward);
+			kfree(ns->projid_map.reverse);
+		}
 		retire_userns_sysctls(ns);
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 		key_put(ns->persistent_keyring_register);
@@ -198,7 +212,84 @@ void __put_user_ns(struct user_namespace *ns)
 }
 EXPORT_SYMBOL(__put_user_ns);
 
-static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+/**
+ * idmap_key struct holds the information necessary to find an idmapping in a
+ * sorted idmap array. It is passed to cmp_map_id() as first argument.
+ */
+struct idmap_key {
+	bool map_up; /* true  -> id from kid; false -> kid from id */
+	u32 id; /* id to find */
+	u32 count; /* == 0 unless used with map_id_range_down() */
+};
+
+/**
+ * cmp_map_id - Function to be passed to bsearch() to find the requested
+ * idmapping. Expects struct idmap_key to be passed via @k.
+ */
+static int cmp_map_id(const void *k, const void *e)
+{
+	u32 first, last, id2;
+	const struct idmap_key *key = k;
+	const struct uid_gid_extent *el = e;
+
+	/* handle map_id_range_down() */
+	if (key->count)
+		id2 = key->id + key->count - 1;
+	else
+		id2 = key->id;
+
+	/* handle map_id_{down,up}() */
+	if (key->map_up)
+		first = el->lower_first;
+	else
+		first = el->first;
+
+	last = first + el->count - 1;
+
+	if (key->id >= first && key->id <= last &&
+	    (id2 >= first && id2 <= last))
+		return 0;
+
+	if (key->id < first || id2 < first)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count)
+{
+	u32 extents;
+	struct uid_gid_extent *extent;
+	struct idmap_key key;
+
+	key.map_up = false;
+	key.count = count;
+	key.id = id;
+
+	extents = map->nr_extents;
+	smp_rmb();
+
+	extent = bsearch(&key, map->forward, extents,
+			 sizeof(struct uid_gid_extent), cmp_map_id);
+	/* Map the id or note failure */
+	if (extent)
+		id = (id - extent->first) + extent->lower_first;
+	else
+		id = (u32) -1;
+
+	return id;
+}
+
+/**
+ * map_id_range_down_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count)
 {
 	unsigned idx, extents;
 	u32 first, last, id2;
@@ -224,7 +315,23 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 	return id;
 }
 
-static u32 map_id_down(struct uid_gid_map *map, u32 id)
+static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+{
+	u32 extents = map->nr_extents;
+	smp_rmb();
+
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return map_id_range_down_base(map, id, count);
+
+	return map_id_range_down_max(map, id, count);
+}
+
+/**
+ * map_id_down_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_down_base(struct uid_gid_map *map, u32 id)
 {
 	unsigned idx, extents;
 	u32 first, last;
@@ -247,7 +354,23 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
 	return id;
 }
 
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
+static u32 map_id_down(struct uid_gid_map *map, u32 id)
+{
+	u32 extents = map->nr_extents;
+	smp_rmb();
+
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return map_id_down_base(map, id);
+
+	return map_id_range_down_max(map, id, 0);
+}
+
+/**
+ * map_id_up_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_up_base(struct uid_gid_map *map, u32 id)
 {
 	unsigned idx, extents;
 	u32 first, last;
@@ -270,6 +393,45 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
 	return id;
 }
 
+/**
+ * map_id_up_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static u32 map_id_up_max(struct uid_gid_map *map, u32 id)
+{
+	u32 extents;
+	struct uid_gid_extent *extent;
+	struct idmap_key key;
+
+	key.map_up = true;
+	key.count = 0;
+	key.id = id;
+
+	extents = map->nr_extents;
+	smp_rmb();
+
+	extent = bsearch(&key, map->reverse, extents,
+			 sizeof(struct uid_gid_extent), cmp_map_id);
+	/* Map the id or note failure */
+	if (extent)
+		id = (id - extent->lower_first) + extent->first;
+	else
+		id = (u32) -1;
+
+	return id;
+}
+
+static u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+	u32 extents = map->nr_extents;
+	smp_rmb();
+
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return map_id_up_base(map, id);
+
+	return map_id_up_max(map, id);
+}
+
 /**
  *	make_kuid - Map a user-namespace uid pair into a kuid.
  *	@ns:  User namespace that the uid is in
@@ -540,13 +702,15 @@ static int projid_m_show(struct seq_file *seq, void *v)
 static void *m_start(struct seq_file *seq, loff_t *ppos,
 		     struct uid_gid_map *map)
 {
-	struct uid_gid_extent *extent = NULL;
 	loff_t pos = *ppos;
 
-	if (pos < map->nr_extents)
-		extent = &map->extent[pos];
+	if (pos >= map->nr_extents)
+		return NULL;
+
+	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return &map->extent[pos];
 
-	return extent;
+	return &map->forward[pos];
 }
 
 static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
@@ -618,7 +782,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
 		u32 prev_upper_last, prev_lower_last;
 		struct uid_gid_extent *prev;
 
-		prev = &new_map->extent[idx];
+		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+			prev = &new_map->extent[idx];
+		else
+			prev = &new_map->forward[idx];
 
 		prev_upper_first = prev->first;
 		prev_lower_first = prev->lower_first;
@@ -638,6 +805,102 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
 	return false;
 }
 
+/**
+ * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
+ * Takes care to allocate a 4K block of memory if the number of mappings exceeds
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
+{
+	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) {
+		map->extent[map->nr_extents].first = extent->first;
+		map->extent[map->nr_extents].lower_first = extent->lower_first;
+		map->extent[map->nr_extents].count = extent->count;
+		return 0;
+	}
+
+	if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
+		struct uid_gid_extent *forward;
+
+		/* Allocate memory for 340 mappings. */
+		forward = kmalloc(sizeof(struct uid_gid_extent) *
+				 UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL);
+		if (!forward)
+			return -ENOMEM;
+
+		/* Copy over memory. Only set up memory for the forward pointer.
+		 * Defer the memory setup for the reverse pointer.
+		 */
+		memcpy(forward, map->extent,
+		       map->nr_extents * sizeof(map->extent[0]));
+
+		map->forward = forward;
+		map->reverse = NULL;
+	}
+
+	map->forward[map->nr_extents].first = extent->first;
+	map->forward[map->nr_extents].lower_first = extent->lower_first;
+	map->forward[map->nr_extents].count = extent->count;
+	return 0;
+}
+
+/* cmp function to sort() forward mappings */
+static int cmp_extents_forward(const void *a, const void *b)
+{
+	const struct uid_gid_extent *e1 = a;
+	const struct uid_gid_extent *e2 = b;
+
+	if (e1->first < e2->first)
+		return -1;
+
+	if (e1->first > e2->first)
+		return 1;
+
+	return 0;
+}
+
+/* cmp function to sort() reverse mappings */
+static int cmp_extents_reverse(const void *a, const void *b)
+{
+	const struct uid_gid_extent *e1 = a;
+	const struct uid_gid_extent *e2 = b;
+
+	if (e1->lower_first < e2->lower_first)
+		return -1;
+
+	if (e1->lower_first > e2->lower_first)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * sort_idmaps - Sorts an array of idmap entries.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int sort_idmaps(struct uid_gid_map *map)
+{
+	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return 0;
+
+	/* Sort forward array. */
+	sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
+	     cmp_extents_forward, NULL);
+
+	/* Only copy the memory from forward we actually need. */
+	map->reverse = kmemdup(map->forward,
+			       map->nr_extents * sizeof(struct uid_gid_extent),
+			       GFP_KERNEL);
+	if (!map->reverse)
+		return -ENOMEM;
+
+	/* Sort reverse array. */
+	sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
+	     cmp_extents_reverse, NULL);
+
+	return 0;
+}
+
 static ssize_t map_write(struct file *file, const char __user *buf,
 			 size_t count, loff_t *ppos,
 			 int cap_setid,
@@ -648,7 +911,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	struct user_namespace *ns = seq->private;
 	struct uid_gid_map new_map;
 	unsigned idx;
-	struct uid_gid_extent *extent = NULL;
+	struct uid_gid_extent extent;
 	char *kbuf = NULL, *pos, *next_line;
 	ssize_t ret = -EINVAL;
 
@@ -673,6 +936,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	 */
 	mutex_lock(&userns_state_mutex);
 
+	memset(&new_map, 0, sizeof(struct uid_gid_map));
+
 	ret = -EPERM;
 	/* Only allow one successful write to the map */
 	if (map->nr_extents != 0)
@@ -700,9 +965,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	/* Parse the user data */
 	ret = -EINVAL;
 	pos = kbuf;
-	new_map.nr_extents = 0;
 	for (; pos; pos = next_line) {
-		extent = &new_map.extent[new_map.nr_extents];
 
 		/* Find the end of line and ensure I don't look past it */
 		next_line = strchr(pos, '\n');
@@ -714,17 +977,17 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		}
 
 		pos = skip_spaces(pos);
-		extent->first = simple_strtoul(pos, &pos, 10);
+		extent.first = simple_strtoul(pos, &pos, 10);
 		if (!isspace(*pos))
 			goto out;
 
 		pos = skip_spaces(pos);
-		extent->lower_first = simple_strtoul(pos, &pos, 10);
+		extent.lower_first = simple_strtoul(pos, &pos, 10);
 		if (!isspace(*pos))
 			goto out;
 
 		pos = skip_spaces(pos);
-		extent->count = simple_strtoul(pos, &pos, 10);
+		extent.count = simple_strtoul(pos, &pos, 10);
 		if (*pos && !isspace(*pos))
 			goto out;
 
@@ -734,29 +997,33 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 			goto out;
 
 		/* Verify we have been given valid starting values */
-		if ((extent->first == (u32) -1) ||
-		    (extent->lower_first == (u32) -1))
+		if ((extent.first == (u32) -1) ||
+		    (extent.lower_first == (u32) -1))
 			goto out;
 
 		/* Verify count is not zero and does not cause the
 		 * extent to wrap
 		 */
-		if ((extent->first + extent->count) <= extent->first)
+		if ((extent.first + extent.count) <= extent.first)
 			goto out;
-		if ((extent->lower_first + extent->count) <=
-		     extent->lower_first)
+		if ((extent.lower_first + extent.count) <=
+		     extent.lower_first)
 			goto out;
 
 		/* Do the ranges in extent overlap any previous extents? */
-		if (mappings_overlap(&new_map, extent))
+		if (mappings_overlap(&new_map, &extent))
 			goto out;
 
-		new_map.nr_extents++;
-
-		/* Fail if the file contains too many extents */
-		if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
+		if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
 		    (next_line != NULL))
 			goto out;
+
+		ret = insert_extent(&new_map, &extent);
+		if (ret < 0)
+			goto out;
+		ret = -EINVAL;
+
+		new_map.nr_extents++;
 	}
 	/* Be very certaint the new map actually exists */
 	if (new_map.nr_extents == 0)
@@ -767,16 +1034,26 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
 		goto out;
 
+	ret = sort_idmaps(&new_map);
+	if (ret < 0)
+		goto out;
+
+	ret = -EPERM;
 	/* Map the lower ids from the parent user namespace to the
 	 * kernel global id space.
 	 */
 	for (idx = 0; idx < new_map.nr_extents; idx++) {
+		struct uid_gid_extent *e;
 		u32 lower_first;
-		extent = &new_map.extent[idx];
+
+		if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+			e = &new_map.extent[idx];
+		else
+			e = &new_map.forward[idx];
 
 		lower_first = map_id_range_down(parent_map,
-						extent->lower_first,
-						extent->count);
+						e->lower_first,
+						e->count);
 
 		/* Fail if we can not map the specified extent to
 		 * the kernel global id space.
@@ -784,18 +1061,31 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		if (lower_first == (u32) -1)
 			goto out;
 
-		extent->lower_first = lower_first;
+		e->lower_first = lower_first;
 	}
 
 	/* Install the map */
-	memcpy(map->extent, new_map.extent,
-		new_map.nr_extents*sizeof(new_map.extent[0]));
+	if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+		memcpy(map->extent, new_map.extent,
+		       new_map.nr_extents * sizeof(new_map.extent[0]));
+	} else {
+		map->forward = new_map.forward;
+		map->reverse = new_map.reverse;
+	}
 	smp_wmb();
 	map->nr_extents = new_map.nr_extents;
 
 	*ppos = count;
 	ret = count;
 out:
+	if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(new_map.forward);
+		kfree(new_map.reverse);
+		map->forward = NULL;
+		map->reverse = NULL;
+		map->nr_extents = 0;
+	}
+
 	mutex_unlock(&userns_state_mutex);
 	kfree(kbuf);
 	return ret;
-- 
cgit v1.3-14-g43fede


From 11a8b9270e16e36d5fb607ba4b60db2958b7c625 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 15:54:32 -0500
Subject: userns: Don't special case a count of 0

We can always use a count of 1 so there is no reason to have
a special case of a count of 0.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 5fd2d53dbc75..c9904ee084c4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -232,11 +232,7 @@ static int cmp_map_id(const void *k, const void *e)
 	const struct idmap_key *key = k;
 	const struct uid_gid_extent *el = e;
 
-	/* handle map_id_range_down() */
-	if (key->count)
-		id2 = key->id + key->count - 1;
-	else
-		id2 = key->id;
+	id2 = key->id + key->count - 1;
 
 	/* handle map_id_{down,up}() */
 	if (key->map_up)
@@ -362,7 +358,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
 	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
 		return map_id_down_base(map, id);
 
-	return map_id_range_down_max(map, id, 0);
+	return map_id_range_down_max(map, id, 1);
 }
 
 /**
@@ -404,7 +400,7 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id)
 	struct idmap_key key;
 
 	key.map_up = true;
-	key.count = 0;
+	key.count = 1;
 	key.id = id;
 
 	extents = map->nr_extents;
-- 
cgit v1.3-14-g43fede


From 3edf652fa16562fb57a5a4b996ba72e2d7cdc38b Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 16:27:29 -0500
Subject: userns: Simplify the user and group mapping functions

Consolidate reading the number of extents and computing the return
value in the map_id_down, map_id_range_down and map_id_range.

This removal of one read of extents makes one smp_rmb unnecessary
and makes the code safe it is executed during the map write.  Reading
the number of extents twice and depending on the result being the same
is not safe, as it could be 0 the first time and > 5 the second time,
which would lead to misinterpreting the union fields.

The consolidation of the return value just removes a duplicate
caluculation which should make it easier to understand and maintain
the code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 132 +++++++++++++++++++++---------------------------
 1 file changed, 58 insertions(+), 74 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c9904ee084c4..563a2981d7c7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -256,28 +256,17 @@ static int cmp_map_id(const void *k, const void *e)
  * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
  * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count)
+static struct uid_gid_extent *
+map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
 {
-	u32 extents;
-	struct uid_gid_extent *extent;
 	struct idmap_key key;
 
 	key.map_up = false;
 	key.count = count;
 	key.id = id;
 
-	extents = map->nr_extents;
-	smp_rmb();
-
-	extent = bsearch(&key, map->forward, extents,
-			 sizeof(struct uid_gid_extent), cmp_map_id);
-	/* Map the id or note failure */
-	if (extent)
-		id = (id - extent->first) + extent->lower_first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return bsearch(&key, map->forward, extents,
+		       sizeof(struct uid_gid_extent), cmp_map_id);
 }
 
 /**
@@ -285,41 +274,43 @@ static u32 map_id_range_down_max(struct uid_gid_map *map, u32 id, u32 count)
  * Can only be called if number of mappings is equal or less than
  * UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_range_down_base(struct uid_gid_map *map, u32 id, u32 count)
+static struct uid_gid_extent *
+map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
 {
-	unsigned idx, extents;
+	unsigned idx;
 	u32 first, last, id2;
 
 	id2 = id + count - 1;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
 		if (id >= first && id <= last &&
 		    (id2 >= first && id2 <= last))
-			break;
+			return &map->extent[idx];
 	}
-	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].lower_first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return NULL;
 }
 
 static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 {
-	u32 extents = map->nr_extents;
+	struct uid_gid_extent *extent;
+	unsigned extents = map->nr_extents;
 	smp_rmb();
 
 	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
-		return map_id_range_down_base(map, id, count);
+		extent = map_id_range_down_base(extents, map, id, count);
+	else
+		extent = map_id_range_down_max(extents, map, id, count);
 
-	return map_id_range_down_max(map, id, count);
+	/* Map the id or note failure */
+	if (extent)
+		id = (id - extent->first) + extent->lower_first;
+	else
+		id = (u32) -1;
+
+	return id;
 }
 
 /**
@@ -327,38 +318,40 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
  * Can only be called if number of mappings is equal or less than
  * UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_down_base(struct uid_gid_map *map, u32 id)
+static struct uid_gid_extent *
+map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id)
 {
-	unsigned idx, extents;
+	unsigned idx;
 	u32 first, last;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
 		if (id >= first && id <= last)
-			break;
+			return &map->extent[idx];
 	}
-	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].lower_first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return NULL;
 }
 
 static u32 map_id_down(struct uid_gid_map *map, u32 id)
 {
-	u32 extents = map->nr_extents;
+	struct uid_gid_extent *extent;
+	unsigned extents = map->nr_extents;
 	smp_rmb();
 
 	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
-		return map_id_down_base(map, id);
+		extent = map_id_down_base(extents, map, id);
+	else
+		extent = map_id_range_down_max(extents, map, id, 1);
 
-	return map_id_range_down_max(map, id, 1);
+	/* Map the id or note failure */
+	if (extent)
+		id = (id - extent->first) + extent->lower_first;
+	else
+		id = (u32) -1;
+
+	return id;
 }
 
 /**
@@ -366,48 +359,50 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id)
  * Can only be called if number of mappings is equal or less than
  * UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_up_base(struct uid_gid_map *map, u32 id)
+static struct uid_gid_extent *
+map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
 {
-	unsigned idx, extents;
+	unsigned idx;
 	u32 first, last;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].lower_first;
 		last = first + map->extent[idx].count - 1;
 		if (id >= first && id <= last)
-			break;
+			return &map->extent[idx];
 	}
-	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return NULL;
 }
 
 /**
  * map_id_up_max - Find idmap via binary search in ordered idmap array.
  * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
  */
-static u32 map_id_up_max(struct uid_gid_map *map, u32 id)
+static struct uid_gid_extent *
+map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
 {
-	u32 extents;
-	struct uid_gid_extent *extent;
 	struct idmap_key key;
 
 	key.map_up = true;
 	key.count = 1;
 	key.id = id;
 
-	extents = map->nr_extents;
+	return bsearch(&key, map->reverse, extents,
+		       sizeof(struct uid_gid_extent), cmp_map_id);
+}
+
+static u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+	struct uid_gid_extent *extent;
+	unsigned extents = map->nr_extents;
 	smp_rmb();
 
-	extent = bsearch(&key, map->reverse, extents,
-			 sizeof(struct uid_gid_extent), cmp_map_id);
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		extent = map_id_up_base(extents, map, id);
+	else
+		extent = map_id_up_max(extents, map, id);
+
 	/* Map the id or note failure */
 	if (extent)
 		id = (id - extent->lower_first) + extent->first;
@@ -417,17 +412,6 @@ static u32 map_id_up_max(struct uid_gid_map *map, u32 id)
 	return id;
 }
 
-static u32 map_id_up(struct uid_gid_map *map, u32 id)
-{
-	u32 extents = map->nr_extents;
-	smp_rmb();
-
-	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
-		return map_id_up_base(map, id);
-
-	return map_id_up_max(map, id);
-}
-
 /**
  *	make_kuid - Map a user-namespace uid pair into a kuid.
  *	@ns:  User namespace that the uid is in
-- 
cgit v1.3-14-g43fede


From d5e7b3c5f51fc6d34e12b6d87bfd30ab277c4625 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 17:09:34 -0500
Subject: userns: Don't read extents twice in m_start

This is important so reading /proc/<pid>/{uid_map,gid_map,projid_map} while
the map is being written does not do strange things.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 563a2981d7c7..4f7e357ac1e2 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -683,11 +683,13 @@ static void *m_start(struct seq_file *seq, loff_t *ppos,
 		     struct uid_gid_map *map)
 {
 	loff_t pos = *ppos;
+	unsigned extents = map->nr_extents;
+	smp_rmb();
 
-	if (pos >= map->nr_extents)
+	if (pos >= extents)
 		return NULL;
 
-	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
 		return &map->extent[pos];
 
 	return &map->forward[pos];
-- 
cgit v1.3-14-g43fede


From ece66133979b211324cc6aff9285889b425243d2 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 16:53:09 -0500
Subject: userns: Make map_id_down a wrapper for map_id_range_down

There is no good reason for this code duplication, the number of cache
line accesses not the number of instructions are the bottleneck in
this code.

Therefore simplify maintenance by removing unnecessary code.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 38 +-------------------------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4f7e357ac1e2..1d0298870ee3 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -313,45 +313,9 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 	return id;
 }
 
-/**
- * map_id_down_base - Find idmap via binary search in static extent array.
- * Can only be called if number of mappings is equal or less than
- * UID_GID_MAP_MAX_BASE_EXTENTS.
- */
-static struct uid_gid_extent *
-map_id_down_base(unsigned extents, struct uid_gid_map *map, u32 id)
-{
-	unsigned idx;
-	u32 first, last;
-
-	/* Find the matching extent */
-	for (idx = 0; idx < extents; idx++) {
-		first = map->extent[idx].first;
-		last = first + map->extent[idx].count - 1;
-		if (id >= first && id <= last)
-			return &map->extent[idx];
-	}
-	return NULL;
-}
-
 static u32 map_id_down(struct uid_gid_map *map, u32 id)
 {
-	struct uid_gid_extent *extent;
-	unsigned extents = map->nr_extents;
-	smp_rmb();
-
-	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
-		extent = map_id_down_base(extents, map, id);
-	else
-		extent = map_id_range_down_max(extents, map, id, 1);
-
-	/* Map the id or note failure */
-	if (extent)
-		id = (id - extent->first) + extent->lower_first;
-	else
-		id = (u32) -1;
-
-	return id;
+	return map_id_range_down(map, id, 1);
 }
 
 /**
-- 
cgit v1.3-14-g43fede


From 3fda0e737e906ce73220b20c27e7f792d0aac6a8 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 31 Oct 2017 17:15:30 -0500
Subject: userns: Simplify insert_extent

Consolidate the code to write to the new mapping at the end of the
function to remove the duplication.  Move the increase in the number
of mappings into insert_extent, keeping the logic together.

Just a small increase in readability and maintainability.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/user_namespace.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1d0298870ee3..899c31060ff3 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -758,12 +758,7 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
  */
 static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
 {
-	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) {
-		map->extent[map->nr_extents].first = extent->first;
-		map->extent[map->nr_extents].lower_first = extent->lower_first;
-		map->extent[map->nr_extents].count = extent->count;
-		return 0;
-	}
+	struct uid_gid_extent *dest;
 
 	if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
 		struct uid_gid_extent *forward;
@@ -784,9 +779,13 @@ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
 		map->reverse = NULL;
 	}
 
-	map->forward[map->nr_extents].first = extent->first;
-	map->forward[map->nr_extents].lower_first = extent->lower_first;
-	map->forward[map->nr_extents].count = extent->count;
+	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
+		dest = &map->extent[map->nr_extents];
+	else
+		dest = &map->forward[map->nr_extents];
+
+	*dest = *extent;
+	map->nr_extents++;
 	return 0;
 }
 
@@ -968,8 +967,6 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		if (ret < 0)
 			goto out;
 		ret = -EINVAL;
-
-		new_map.nr_extents++;
 	}
 	/* Be very certaint the new map actually exists */
 	if (new_map.nr_extents == 0)
-- 
cgit v1.3-14-g43fede


From 638f5b90d46016372a8e3e0a434f199cc5e12b8c Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Tue, 31 Oct 2017 18:16:05 -0700
Subject: bpf: reduce verifier memory consumption

the verifier got progressively smarter over time and size of its internal
state grew as well. Time to reduce the memory consumption.

Before:
sizeof(struct bpf_verifier_state) = 6520
After:
sizeof(struct bpf_verifier_state) = 896

It's done by observing that majority of BPF programs use little to
no stack whereas verifier kept all of 512 stack slots ready always.
Instead dynamically reallocate struct verifier state when stack
access is detected.
Runtime difference before vs after is within a noise.
The number of processed instructions stays the same.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/verifier.c |   8 +-
 include/linux/bpf_verifier.h                      |  16 +-
 kernel/bpf/verifier.c                             | 437 ++++++++++++++--------
 3 files changed, 305 insertions(+), 156 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 3d3dcac1c942..a8c7615546a9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -76,9 +76,9 @@ nfp_bpf_goto_meta(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 
 static int
 nfp_bpf_check_exit(struct nfp_prog *nfp_prog,
-		   const struct bpf_verifier_env *env)
+		   struct bpf_verifier_env *env)
 {
-	const struct bpf_reg_state *reg0 = &env->cur_state.regs[0];
+	const struct bpf_reg_state *reg0 = cur_regs(env) + BPF_REG_0;
 	u64 imm;
 
 	if (nfp_prog->act == NN_ACT_XDP)
@@ -144,9 +144,9 @@ nfp_bpf_check_stack_access(struct nfp_prog *nfp_prog,
 
 static int
 nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
-		  const struct bpf_verifier_env *env, u8 reg_no)
+		  struct bpf_verifier_env *env, u8 reg_no)
 {
-	const struct bpf_reg_state *reg = &env->cur_state.regs[reg_no];
+	const struct bpf_reg_state *reg = cur_regs(env) + reg_no;
 	int err;
 
 	if (reg->type != PTR_TO_CTX &&
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index feeaea93d959..3b0976aaac75 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -88,14 +88,19 @@ enum bpf_stack_slot_type {
 
 #define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
 
+struct bpf_stack_state {
+	struct bpf_reg_state spilled_ptr;
+	u8 slot_type[BPF_REG_SIZE];
+};
+
 /* state of the program:
  * type of all registers and stack info
  */
 struct bpf_verifier_state {
 	struct bpf_reg_state regs[MAX_BPF_REG];
-	u8 stack_slot_type[MAX_BPF_STACK];
-	struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
 	struct bpf_verifier_state *parent;
+	int allocated_stack;
+	struct bpf_stack_state *stack;
 };
 
 /* linked list of verifier states used to prune search */
@@ -145,7 +150,7 @@ struct bpf_verifier_env {
 	struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */
 	int stack_size;			/* number of states to be processed */
 	bool strict_alignment;		/* perform strict pointer alignment checks */
-	struct bpf_verifier_state cur_state; /* current verifier state */
+	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
 	void *analyzer_priv; /* pointer to external analyzer's private data */
@@ -159,6 +164,11 @@ struct bpf_verifier_env {
 	struct bpf_verifer_log log;
 };
 
+static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
+{
+	return env->cur_state->regs;
+}
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv);
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d906775e12c1..5f26f7ad124f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -276,43 +276,132 @@ static void print_verifier_state(struct bpf_verifier_env *env,
 			verbose(env, ")");
 		}
 	}
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] == STACK_SPILL)
-			verbose(env, " fp%d=%s", -MAX_BPF_STACK + i,
-				reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] == STACK_SPILL)
+			verbose(env, " fp%d=%s",
+				-MAX_BPF_STACK + i * BPF_REG_SIZE,
+				reg_type_str[state->stack[i].spilled_ptr.type]);
 	}
 	verbose(env, "\n");
 }
 
-static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
+static int copy_stack_state(struct bpf_verifier_state *dst,
+			    const struct bpf_verifier_state *src)
 {
-	struct bpf_verifier_stack_elem *elem;
-	int insn_idx;
+	if (!src->stack)
+		return 0;
+	if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
+		/* internal bug, make state invalid to reject the program */
+		memset(dst, 0, sizeof(*dst));
+		return -EFAULT;
+	}
+	memcpy(dst->stack, src->stack,
+	       sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
+	return 0;
+}
+
+/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
+ * make it consume minimal amount of memory. check_stack_write() access from
+ * the program calls into realloc_verifier_state() to grow the stack size.
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+ * which this function copies over. It points to previous bpf_verifier_state
+ * which is never reallocated
+ */
+static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
+				  bool copy_old)
+{
+	u32 old_size = state->allocated_stack;
+	struct bpf_stack_state *new_stack;
+	int slot = size / BPF_REG_SIZE;
+
+	if (size <= old_size || !size) {
+		if (copy_old)
+			return 0;
+		state->allocated_stack = slot * BPF_REG_SIZE;
+		if (!size && old_size) {
+			kfree(state->stack);
+			state->stack = NULL;
+		}
+		return 0;
+	}
+	new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
+				  GFP_KERNEL);
+	if (!new_stack)
+		return -ENOMEM;
+	if (copy_old) {
+		if (state->stack)
+			memcpy(new_stack, state->stack,
+			       sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
+		memset(new_stack + old_size / BPF_REG_SIZE, 0,
+		       sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
+	}
+	state->allocated_stack = slot * BPF_REG_SIZE;
+	kfree(state->stack);
+	state->stack = new_stack;
+	return 0;
+}
+
+static void free_verifier_state(struct bpf_verifier_state *state)
+{
+	kfree(state->stack);
+	kfree(state);
+}
+
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_verifier_state(struct bpf_verifier_state *dst,
+			       const struct bpf_verifier_state *src)
+{
+	int err;
+
+	err = realloc_verifier_state(dst, src->allocated_stack, false);
+	if (err)
+		return err;
+	memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+	return copy_stack_state(dst, src);
+}
+
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+		     int *insn_idx)
+{
+	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_verifier_stack_elem *elem, *head = env->head;
+	int err;
 
 	if (env->head == NULL)
-		return -1;
+		return -ENOENT;
 
-	memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
-	insn_idx = env->head->insn_idx;
+	if (cur) {
+		err = copy_verifier_state(cur, &head->st);
+		if (err)
+			return err;
+	}
+	if (insn_idx)
+		*insn_idx = head->insn_idx;
 	if (prev_insn_idx)
-		*prev_insn_idx = env->head->prev_insn_idx;
-	elem = env->head->next;
-	kfree(env->head);
+		*prev_insn_idx = head->prev_insn_idx;
+	elem = head->next;
+	kfree(head);
 	env->head = elem;
 	env->stack_size--;
-	return insn_idx;
+	return 0;
 }
 
 static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 					     int insn_idx, int prev_insn_idx)
 {
+	struct bpf_verifier_state *cur = env->cur_state;
 	struct bpf_verifier_stack_elem *elem;
+	int err;
 
-	elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
 	if (!elem)
 		goto err;
 
-	memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+	err = copy_verifier_state(&elem->st, cur);
+	if (err)
+		return NULL;
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
@@ -325,7 +414,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	return &elem->st;
 err:
 	/* pop all elements and return */
-	while (pop_stack(env, NULL) >= 0);
+	while (!pop_stack(env, NULL, NULL));
 	return NULL;
 }
 
@@ -550,7 +639,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			 enum reg_arg_type t)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = env->cur_state->regs;
 
 	if (regno >= MAX_BPF_REG) {
 		verbose(env, "R%d is invalid\n", regno);
@@ -563,7 +652,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
-		mark_reg_read(&env->cur_state, regno);
+		mark_reg_read(env->cur_state, regno);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
@@ -601,10 +690,21 @@ static int check_stack_write(struct bpf_verifier_env *env,
 			     struct bpf_verifier_state *state, int off,
 			     int size, int value_regno)
 {
-	int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+
+	err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
+				     true);
+	if (err)
+		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
 	 * so it's aligned access and [off, off + size) are within stack limits
 	 */
+	if (!env->allow_ptr_leaks &&
+	    state->stack[spi].slot_type[0] == STACK_SPILL &&
+	    size != BPF_REG_SIZE) {
+		verbose(env, "attempt to corrupt spilled pointer on stack\n");
+		return -EACCES;
+	}
 
 	if (value_regno >= 0 &&
 	    is_spillable_regtype(state->regs[value_regno].type)) {
@@ -616,17 +716,18 @@ static int check_stack_write(struct bpf_verifier_env *env,
 		}
 
 		/* save register state */
-		state->spilled_regs[spi] = state->regs[value_regno];
-		state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
+		state->stack[spi].spilled_ptr = state->regs[value_regno];
+		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
 		for (i = 0; i < BPF_REG_SIZE; i++)
-			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+			state->stack[spi].slot_type[i] = STACK_SPILL;
 	} else {
 		/* regular write of data into stack */
-		state->spilled_regs[spi] = (struct bpf_reg_state) {};
+		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
 
 		for (i = 0; i < size; i++)
-			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+				STACK_MISC;
 	}
 	return 0;
 }
@@ -637,10 +738,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
 
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
-		if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+		if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
 			break;
 		/* ... then we depend on parent's value */
-		parent->spilled_regs[slot].live |= REG_LIVE_READ;
+		parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
 		state = parent;
 		parent = state->parent;
 	}
@@ -650,34 +751,37 @@ static int check_stack_read(struct bpf_verifier_env *env,
 			    struct bpf_verifier_state *state, int off, int size,
 			    int value_regno)
 {
-	u8 *slot_type;
-	int i, spi;
+	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+	u8 *stype;
 
-	slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+	if (state->allocated_stack <= slot) {
+		verbose(env, "invalid read from stack off %d+0 size %d\n",
+			off, size);
+		return -EACCES;
+	}
+	stype = state->stack[spi].slot_type;
 
-	if (slot_type[0] == STACK_SPILL) {
+	if (stype[0] == STACK_SPILL) {
 		if (size != BPF_REG_SIZE) {
 			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
 		for (i = 1; i < BPF_REG_SIZE; i++) {
-			if (slot_type[i] != STACK_SPILL) {
+			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
 				verbose(env, "corrupted spill memory\n");
 				return -EACCES;
 			}
 		}
 
-		spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
-
 		if (value_regno >= 0) {
 			/* restore register state from stack */
-			state->regs[value_regno] = state->spilled_regs[spi];
+			state->regs[value_regno] = state->stack[spi].spilled_ptr;
 			mark_stack_slot_read(state, spi);
 		}
 		return 0;
 	} else {
 		for (i = 0; i < size; i++) {
-			if (slot_type[i] != STACK_MISC) {
+			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
 				verbose(env, "invalid read from stack off %d+%d size %d\n",
 					off, i, size);
 				return -EACCES;
@@ -694,7 +798,8 @@ static int check_stack_read(struct bpf_verifier_env *env,
 static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 			    int size)
 {
-	struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_map *map = regs[regno].map_ptr;
 
 	if (off < 0 || size <= 0 || off + size > map->value_size) {
 		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
@@ -706,9 +811,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 
 /* check read/write into a map element with possible variable offset */
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
-				int off, int size)
+			    int off, int size)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *reg = &state->regs[regno];
 	int err;
 
@@ -783,7 +888,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 				 int off, int size)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 
 	if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
@@ -797,7 +902,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 			       int size)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 	int err;
 
@@ -866,7 +971,7 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
 
 static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 {
-	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
 }
 
 static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
@@ -968,8 +1073,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			    int bpf_size, enum bpf_access_type t,
 			    int value_regno)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
-	struct bpf_reg_state *reg = &state->regs[regno];
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = regs + regno;
 	int size, err = 0;
 
 	size = bpf_size_to_bytes(bpf_size);
@@ -993,7 +1099,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 
 		err = check_map_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(env, state->regs, value_regno);
+			mark_reg_unknown(env, regs, value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = SCALAR_VALUE;
@@ -1028,14 +1134,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
-				mark_reg_unknown(env, state->regs, value_regno);
+				mark_reg_unknown(env, regs, value_regno);
 			else
-				mark_reg_known_zero(env, state->regs,
+				mark_reg_known_zero(env, regs,
 						    value_regno);
-			state->regs[value_regno].id = 0;
-			state->regs[value_regno].off = 0;
-			state->regs[value_regno].range = 0;
-			state->regs[value_regno].type = reg_type;
+			regs[value_regno].id = 0;
+			regs[value_regno].off = 0;
+			regs[value_regno].range = 0;
+			regs[value_regno].type = reg_type;
 		}
 
 	} else if (reg->type == PTR_TO_STACK) {
@@ -1061,19 +1167,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		if (env->prog->aux->stack_depth < -off)
 			env->prog->aux->stack_depth = -off;
 
-		if (t == BPF_WRITE) {
-			if (!env->allow_ptr_leaks &&
-			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
-			    size != BPF_REG_SIZE) {
-				verbose(env, "attempt to corrupt spilled pointer on stack\n");
-				return -EACCES;
-			}
+		if (t == BPF_WRITE)
 			err = check_stack_write(env, state, off, size,
 						value_regno);
-		} else {
+		else
 			err = check_stack_read(env, state, off, size,
 					       value_regno);
-		}
 	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
 			verbose(env, "cannot write into packet\n");
@@ -1087,7 +1186,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(env, state->regs, value_regno);
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -1095,11 +1194,11 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	}
 
 	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
-	    state->regs[value_regno].type == SCALAR_VALUE) {
+	    regs[value_regno].type == SCALAR_VALUE) {
 		/* b/h/w load zero-extends, mark upper bits as known 0 */
-		state->regs[value_regno].var_off = tnum_cast(
-					state->regs[value_regno].var_off, size);
-		__update_reg_bounds(&state->regs[value_regno]);
+		regs[value_regno].var_off =
+			tnum_cast(regs[value_regno].var_off, size);
+		__update_reg_bounds(&regs[value_regno]);
 	}
 	return err;
 }
@@ -1156,9 +1255,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 				int access_size, bool zero_size_allowed,
 				struct bpf_call_arg_meta *meta)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *regs = state->regs;
-	int off, i;
+	int off, i, slot, spi;
 
 	if (regs[regno].type != PTR_TO_STACK) {
 		/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -1198,7 +1297,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 	}
 
 	for (i = 0; i < access_size; i++) {
-		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
+		slot = -(off + i) - 1;
+		spi = slot / BPF_REG_SIZE;
+		if (state->allocated_stack <= slot ||
+		    state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
+			STACK_MISC) {
 			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
 				off, i, access_size);
 			return -EACCES;
@@ -1211,7 +1314,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 				   int access_size, bool zero_size_allowed,
 				   struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
@@ -1229,7 +1332,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			  enum bpf_arg_type arg_type,
 			  struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 	enum bpf_reg_type expected_type, type = reg->type;
 	int err = 0;
 
@@ -1514,7 +1617,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
 
@@ -1522,10 +1625,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 		if (reg_is_pkt_pointer_any(&regs[i]))
 			mark_reg_unknown(env, regs, i);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		reg = &state->spilled_regs[i / BPF_REG_SIZE];
+		reg = &state->stack[i].spilled_ptr;
 		if (reg_is_pkt_pointer_any(reg))
 			__mark_reg_unknown(reg);
 	}
@@ -1533,9 +1636,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 
 static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
 	const struct bpf_func_proto *fn = NULL;
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *regs;
 	struct bpf_call_arg_meta meta;
 	bool changes_data;
 	int i, err;
@@ -1603,6 +1705,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 			return err;
 	}
 
+	regs = cur_regs(env);
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
@@ -1691,7 +1794,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 				   const struct bpf_reg_state *ptr_reg,
 				   const struct bpf_reg_state *off_reg)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+	struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
 	bool known = tnum_is_const(off_reg->var_off);
 	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
 	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -1703,13 +1806,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	dst_reg = &regs[dst];
 
 	if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-		print_verifier_state(env, &env->cur_state);
+		print_verifier_state(env, env->cur_state);
 		verbose(env,
 			"verifier internal error: known but bad sbounds\n");
 		return -EINVAL;
 	}
 	if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-		print_verifier_state(env, &env->cur_state);
+		print_verifier_state(env, env->cur_state);
 		verbose(env,
 			"verifier internal error: known but bad ubounds\n");
 		return -EINVAL;
@@ -1890,7 +1993,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 				      struct bpf_reg_state *dst_reg,
 				      struct bpf_reg_state src_reg)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 opcode = BPF_OP(insn->code);
 	bool src_known, dst_known;
 	s64 smin_val, smax_val;
@@ -2111,7 +2214,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				   struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
 	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
 	u8 opcode = BPF_OP(insn->code);
 	int rc;
@@ -2185,12 +2288,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
 	/* Got here implies adding two SCALAR_VALUEs */
 	if (WARN_ON_ONCE(ptr_reg)) {
-		print_verifier_state(env, &env->cur_state);
+		print_verifier_state(env, env->cur_state);
 		verbose(env, "verifier internal error: unexpected ptr_reg\n");
 		return -EINVAL;
 	}
 	if (WARN_ON(!src_reg)) {
-		print_verifier_state(env, &env->cur_state);
+		print_verifier_state(env, env->cur_state);
 		verbose(env, "verifier internal error: no src_reg\n");
 		return -EINVAL;
 	}
@@ -2200,7 +2303,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 /* check validity of 32-bit and 64-bit arithmetic operations */
 static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
@@ -2421,10 +2524,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 			/* keep the maximum range already checked */
 			regs[i].range = max(regs[i].range, new_range);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		reg = &state->spilled_regs[i / BPF_REG_SIZE];
+		reg = &state->stack[i].spilled_ptr;
 		if (reg->type == type && reg->id == dst_reg->id)
 			reg->range = max_t(u16, reg->range, new_range);
 	}
@@ -2674,17 +2777,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
 	for (i = 0; i < MAX_BPF_REG; i++)
 		mark_map_reg(regs, i, id, is_null);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
+		mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
 	}
 }
 
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
-	struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+	struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
 	struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
@@ -2876,7 +2979,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
 /* verify BPF_LD_IMM64 instruction */
 static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	int err;
 
 	if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -2937,7 +3040,7 @@ static bool may_access_skb(enum bpf_prog_type type)
  */
 static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 mode = BPF_MODE(insn->code);
 	int i, err;
 
@@ -2999,7 +3102,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 		return 0;
 	}
 
-	reg = &env->cur_state.regs[BPF_REG_0];
+	reg = cur_regs(env) + BPF_REG_0;
 	if (reg->type != SCALAR_VALUE) {
 		verbose(env, "At program exit the register R0 is not a known value (%s)\n",
 			reg_type_str[reg->type]);
@@ -3363,6 +3466,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	return false;
 }
 
+static bool stacksafe(struct bpf_verifier_state *old,
+		      struct bpf_verifier_state *cur,
+		      struct idpair *idmap)
+{
+	int i, spi;
+
+	/* if explored stack has more populated slots than current stack
+	 * such stacks are not equivalent
+	 */
+	if (old->allocated_stack > cur->allocated_stack)
+		return false;
+
+	/* walk slots of the explored stack and ignore any additional
+	 * slots in the current stack, since explored(safe) state
+	 * didn't use them
+	 */
+	for (i = 0; i < old->allocated_stack; i++) {
+		spi = i / BPF_REG_SIZE;
+
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+			continue;
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+			/* Ex: old explored (safe) state has STACK_SPILL in
+			 * this stack slot, but current has has STACK_MISC ->
+			 * this verifier states are not equivalent,
+			 * return false to continue verification of this path
+			 */
+			return false;
+		if (i % BPF_REG_SIZE)
+			continue;
+		if (old->stack[spi].slot_type[0] != STACK_SPILL)
+			continue;
+		if (!regsafe(&old->stack[spi].spilled_ptr,
+			     &cur->stack[spi].spilled_ptr,
+			     idmap))
+			/* when explored and current stack slot are both storing
+			 * spilled registers, check that stored pointers types
+			 * are the same as well.
+			 * Ex: explored safe path could have stored
+			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+			 * but current path has stored:
+			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+			 * such verifier states are not equivalent.
+			 * return false to continue verification of this path
+			 */
+			return false;
+	}
+	return true;
+}
+
 /* compare two verifier states
  *
  * all states stored in state_list are known to be valid, since
@@ -3407,37 +3561,8 @@ static bool states_equal(struct bpf_verifier_env *env,
 			goto out_free;
 	}
 
-	for (i = 0; i < MAX_BPF_STACK; i++) {
-		if (old->stack_slot_type[i] == STACK_INVALID)
-			continue;
-		if (old->stack_slot_type[i] != cur->stack_slot_type[i])
-			/* Ex: old explored (safe) state has STACK_SPILL in
-			 * this stack slot, but current has has STACK_MISC ->
-			 * this verifier states are not equivalent,
-			 * return false to continue verification of this path
-			 */
-			goto out_free;
-		if (i % BPF_REG_SIZE)
-			continue;
-		if (old->stack_slot_type[i] != STACK_SPILL)
-			continue;
-		if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
-			     &cur->spilled_regs[i / BPF_REG_SIZE],
-			     idmap))
-			/* when explored and current stack slot are both storing
-			 * spilled registers, check that stored pointers types
-			 * are the same as well.
-			 * Ex: explored safe path could have stored
-			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
-			 * but current path has stored:
-			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
-			 * such verifier states are not equivalent.
-			 * return false to continue verification of this path
-			 */
-			goto out_free;
-		else
-			continue;
-	}
+	if (!stacksafe(old, cur, idmap))
+		goto out_free;
 	ret = true;
 out_free:
 	kfree(idmap);
@@ -3473,17 +3598,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state,
 		}
 	}
 	/* ... and stack slots */
-	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
-		if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+		    i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+		if (parent->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		if (parent->spilled_regs[i].live & REG_LIVE_READ)
+		if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
 			continue;
-		if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+		if (writes &&
+		    (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
 			continue;
-		if (state->spilled_regs[i].live & REG_LIVE_READ) {
-			parent->spilled_regs[i].live |= REG_LIVE_READ;
+		if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
+			parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
 			touched = true;
 		}
 	}
@@ -3513,6 +3640,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
+	struct bpf_verifier_state *cur = env->cur_state;
 	int i;
 
 	sl = env->explored_states[insn_idx];
@@ -3523,7 +3651,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		return 0;
 
 	while (sl != STATE_LIST_MARK) {
-		if (states_equal(env, &sl->state, &env->cur_state)) {
+		if (states_equal(env, &sl->state, cur)) {
 			/* reached equivalent register/stack state,
 			 * prune the search.
 			 * Registers read by the continuation are read by us.
@@ -3534,7 +3662,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * they'll be immediately forgotten as we're pruning
 			 * this state and will pop a new one.
 			 */
-			propagate_liveness(&sl->state, &env->cur_state);
+			propagate_liveness(&sl->state, cur);
 			return 1;
 		}
 		sl = sl->next;
@@ -3546,16 +3674,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	 * it will be rejected. Since there are no loops, we won't be
 	 * seeing this 'insn_idx' instruction again on the way to bpf_exit
 	 */
-	new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
+	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
 	if (!new_sl)
 		return -ENOMEM;
 
 	/* add new state to the head of linked list */
-	memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+	copy_verifier_state(&new_sl->state, cur);
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
 	/* connect new state to parentage chain */
-	env->cur_state.parent = &new_sl->state;
+	cur->parent = &new_sl->state;
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
@@ -3563,10 +3691,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	 * explored_states can get read marks.)
 	 */
 	for (i = 0; i < BPF_REG_FP; i++)
-		env->cur_state.regs[i].live = REG_LIVE_NONE;
-	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
-		if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
-			env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
+		cur->regs[i].live = REG_LIVE_NONE;
+	for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
+		if (cur->stack[i].slot_type[0] == STACK_SPILL)
+			cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
 	return 0;
 }
 
@@ -3581,15 +3709,19 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
 
 static int do_check(struct bpf_verifier_env *env)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state;
 	struct bpf_insn *insns = env->prog->insnsi;
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *regs;
 	int insn_cnt = env->prog->len;
 	int insn_idx, prev_insn_idx = 0;
 	int insn_processed = 0;
 	bool do_print_state = false;
 
-	init_reg_state(env, regs);
+	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+	env->cur_state = state;
+	init_reg_state(env, state->regs);
 	state->parent = NULL;
 	insn_idx = 0;
 	for (;;) {
@@ -3637,7 +3769,7 @@ static int do_check(struct bpf_verifier_env *env)
 			else
 				verbose(env, "\nfrom %d to %d:",
 					prev_insn_idx, insn_idx);
-			print_verifier_state(env, &env->cur_state);
+			print_verifier_state(env, state);
 			do_print_state = false;
 		}
 
@@ -3651,6 +3783,7 @@ static int do_check(struct bpf_verifier_env *env)
 		if (err)
 			return err;
 
+		regs = cur_regs(env);
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
 			if (err)
@@ -3818,8 +3951,10 @@ static int do_check(struct bpf_verifier_env *env)
 				if (err)
 					return err;
 process_bpf_exit:
-				insn_idx = pop_stack(env, &prev_insn_idx);
-				if (insn_idx < 0) {
+				err = pop_stack(env, &prev_insn_idx, &insn_idx);
+				if (err < 0) {
+					if (err != -ENOENT)
+						return err;
 					break;
 				} else {
 					do_print_state = true;
@@ -4359,9 +4494,11 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
+	free_verifier_state(env->cur_state);
+	env->cur_state = NULL;
 
 skip_full_check:
-	while (pop_stack(env, NULL) >= 0);
+	while (!pop_stack(env, NULL, NULL));
 	free_states(env);
 
 	if (ret == 0)
@@ -4464,9 +4601,11 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
+	free_verifier_state(env->cur_state);
+	env->cur_state = NULL;
 
 skip_full_check:
-	while (pop_stack(env, NULL) >= 0);
+	while (!pop_stack(env, NULL, NULL));
 	free_states(env);
 
 	mutex_unlock(&bpf_verifier_lock);
-- 
cgit v1.3-14-g43fede


From 07c41a295c5f25928a7cb689fdec816bd0089fe8 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Mon, 30 Oct 2017 13:50:22 -0700
Subject: bpf: avoid rcu_dereference inside bpf_event_mutex lock region

During perf event attaching/detaching bpf programs,
the tp_event->prog_array change is protected by the
bpf_event_mutex lock in both attaching and deteching
functions. Although tp_event->prog_array is a rcu
pointer, rcu_derefrence is not needed to access it
since mutex lock will guarantee ordering.

Verified through "make C=2" that sparse
locking check still happy with the new change.

Also change the label name in perf_event_{attach,detach}_bpf_prog
from "out" to "unlock" to reflect the code action after the label.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 136aa6bb0422..506efe6e8ed9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -769,20 +769,19 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
-		goto out;
+		goto unlock;
 
-	old_array = rcu_dereference_protected(event->tp_event->prog_array,
-					      lockdep_is_held(&bpf_event_mutex));
+	old_array = event->tp_event->prog_array;
 	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
 	if (ret < 0)
-		goto out;
+		goto unlock;
 
 	/* set the new array to event->tp_event and set event->prog */
 	event->prog = prog;
 	rcu_assign_pointer(event->tp_event->prog_array, new_array);
 	bpf_prog_array_free(old_array);
 
-out:
+unlock:
 	mutex_unlock(&bpf_event_mutex);
 	return ret;
 }
@@ -796,11 +795,9 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 	mutex_lock(&bpf_event_mutex);
 
 	if (!event->prog)
-		goto out;
-
-	old_array = rcu_dereference_protected(event->tp_event->prog_array,
-					      lockdep_is_held(&bpf_event_mutex));
+		goto unlock;
 
+	old_array = event->tp_event->prog_array;
 	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
 	if (ret < 0) {
 		bpf_prog_array_delete_safe(old_array, event->prog);
@@ -812,6 +809,6 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
 	bpf_prog_put(event->prog);
 	event->prog = NULL;
 
-out:
+unlock:
 	mutex_unlock(&bpf_event_mutex);
 }
-- 
cgit v1.3-14-g43fede


From 10d94ff4d558b96bfc4f55bb0051ae4d938246fe Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Wed, 1 Nov 2017 10:14:51 +0600
Subject: irq/core: Fix boot crash when the irqaffinity= boot parameter is
 passed on CPUMASK_OFFSTACK=y kernels(v1)

When the irqaffinity= kernel parameter is passed in a CPUMASK_OFFSTACK=y
kernel, it fails to boot, because zalloc_cpumask_var() cannot be used before
initializing the slab allocator to allocate a cpumask.

So, use alloc_bootmem_cpumask_var() instead.

Also do some cleanups while at it: in init_irq_default_affinity() remove
an #ifdef via using cpumask_available().

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20171026045800.27087-1-rakib.mullick@gmail.com
Link: http://lkml.kernel.org/r/20171101041451.12581-1-rakib.mullick@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/irq/irqdesc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 982a3576fb01..f2edcf85780d 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class;
 #if defined(CONFIG_SMP)
 static int __init irq_affinity_setup(char *str)
 {
-	zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+	alloc_bootmem_cpumask_var(&irq_default_affinity);
 	cpulist_parse(str, irq_default_affinity);
 	/*
 	 * Set at least the boot cpu. We don't want to end up with
@@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup);
 
 static void __init init_irq_default_affinity(void)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (!irq_default_affinity)
+	if (!cpumask_available(irq_default_affinity))
 		zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-#endif
 	if (cpumask_empty(irq_default_affinity))
 		cpumask_setall(irq_default_affinity);
 }
-- 
cgit v1.3-14-g43fede


From 1969db47f8d0e800397abd4ee4e8d27d2b578587 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 1 Nov 2017 00:08:04 -0700
Subject: bpf: fix verifier memory leaks

fix verifier memory leaks

Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption")
Signed-off-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5f26f7ad124f..2bb6d6aa7085 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -341,10 +341,12 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
 	return 0;
 }
 
-static void free_verifier_state(struct bpf_verifier_state *state)
+static void free_verifier_state(struct bpf_verifier_state *state,
+				bool free_self)
 {
 	kfree(state->stack);
-	kfree(state);
+	if (free_self)
+		kfree(state);
 }
 
 /* copy verifier state from src to dst growing dst stack space
@@ -382,6 +384,7 @@ static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
 	if (prev_insn_idx)
 		*prev_insn_idx = head->prev_insn_idx;
 	elem = head->next;
+	free_verifier_state(&head->st, false);
 	kfree(head);
 	env->head = elem;
 	env->stack_size--;
@@ -399,14 +402,14 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 	if (!elem)
 		goto err;
 
-	err = copy_verifier_state(&elem->st, cur);
-	if (err)
-		return NULL;
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
 	env->head = elem;
 	env->stack_size++;
+	err = copy_verifier_state(&elem->st, cur);
+	if (err)
+		goto err;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
 		verbose(env, "BPF program is too complex\n");
 		goto err;
@@ -3641,7 +3644,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
 	struct bpf_verifier_state *cur = env->cur_state;
-	int i;
+	int i, err;
 
 	sl = env->explored_states[insn_idx];
 	if (!sl)
@@ -3679,7 +3682,12 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		return -ENOMEM;
 
 	/* add new state to the head of linked list */
-	copy_verifier_state(&new_sl->state, cur);
+	err = copy_verifier_state(&new_sl->state, cur);
+	if (err) {
+		free_verifier_state(&new_sl->state, false);
+		kfree(new_sl);
+		return err;
+	}
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
 	/* connect new state to parentage chain */
@@ -4424,6 +4432,7 @@ static void free_states(struct bpf_verifier_env *env)
 		if (sl)
 			while (sl != STATE_LIST_MARK) {
 				sln = sl->next;
+				free_verifier_state(&sl->state, false);
 				kfree(sl);
 				sl = sln;
 			}
@@ -4494,7 +4503,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
-	free_verifier_state(env->cur_state);
+	free_verifier_state(env->cur_state, true);
 	env->cur_state = NULL;
 
 skip_full_check:
@@ -4601,7 +4610,7 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
-	free_verifier_state(env->cur_state);
+	free_verifier_state(env->cur_state, true);
 	env->cur_state = NULL;
 
 skip_full_check:
-- 
cgit v1.3-14-g43fede


From 39c82caff8610d57ffe32157cb3130dfabe12fbe Mon Sep 17 00:00:00 2001
From: Prasad Sodagudi <psodagud@codeaurora.org>
Date: Thu, 26 Oct 2017 11:37:22 -0700
Subject: clockevents: Update clockevents device next_event on stop

clockevent_device::next_event holds the next timer event of a clock event
device. The value is updated in clockevents_program_event(), i.e. when the
hardware timer is armed for the next expiry.

When there are no software timers armed on a CPU, the corresponding per CPU
clockevent device is brought into ONESHOT_STOPPED state, but
clockevent_device::next_event is not updated, because
clockevents_program_event() is not called.

So the content of clockevent_device::next_event is stale, which is not an
issue when real hardware is used. But the hrtimer broadcast device relies
on that information and the stale value causes spurious wakeups.

Update clockevent_device::next_event to KTIME_MAX when it has been brought
into ONESHOT_STOPPED state to avoid spurious wakeups. This reflects the
proper expiry time of the stopped timer: infinity.

[ tglx: Massaged changelog ]

Signed-off-by: Prasad Sodagudi <psodagud@codeaurora.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: viresh.kumar@linaro.org
Link: https://lkml.kernel.org/r/1509043042-32486-1-git-send-email-psodagud@codeaurora.org
---
 kernel/time/tick-oneshot.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 6b009c207671..c1f518e7aa80 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -33,6 +33,7 @@ int tick_program_event(ktime_t expires, int force)
 		 * We don't need the clock event device any more, stop it.
 		 */
 		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+		dev->next_event = KTIME_MAX;
 		return 0;
 	}
 
-- 
cgit v1.3-14-g43fede


From 03c4cc385faaa46e5877f499c6b997fef792f8d3 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Wed, 1 Nov 2017 12:44:45 +0100
Subject: bpf: cpumap micro-optimization in cpu_map_enqueue

Discovered that the compiler laid-out asm code in suboptimal way
when studying perf report during benchmarking of cpumap. Help
the compiler by the marking unlikely code paths.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/cpumap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 86e29cbf7827..ce5b669003b2 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -208,7 +208,7 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
 	headroom = xdp->data - xdp->data_hard_start;
 	metasize = xdp->data - xdp->data_meta;
 	metasize = metasize > 0 ? metasize : 0;
-	if ((headroom - metasize) < sizeof(*xdp_pkt))
+	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
 		return NULL;
 
 	/* Store info in top of packet */
@@ -656,7 +656,7 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 	struct xdp_pkt *xdp_pkt;
 
 	xdp_pkt = convert_to_xdp_pkt(xdp);
-	if (!xdp_pkt)
+	if (unlikely(!xdp_pkt))
 		return -EOVERFLOW;
 
 	/* Info needed when constructing SKB on remote CPU */
-- 
cgit v1.3-14-g43fede


From b06723da824af1e979eb1699623881b5f45a783c Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 1 Nov 2017 23:58:09 +0100
Subject: bpf: minor cleanups after merge

Two minor cleanups after Dave's recent merge in f8ddadc4db6c
("Merge git://git.kernel.org...") of net into net-next in
order to get the code in line with what was done originally
in the net tree: i) use max() instead of max_t() since both
ranges are u16, ii) don't split the direct access test cases
in the middle with bpf_exit test cases from 390ee7e29fc
("bpf: enforce return code for cgroup-bpf programs").

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c                       |   2 +-
 tools/testing/selftests/bpf/test_verifier.c | 144 ++++++++++++++--------------
 2 files changed, 73 insertions(+), 73 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2bb6d6aa7085..2cc3e9486a1f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2532,7 +2532,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 			continue;
 		reg = &state->stack[i].spilled_ptr;
 		if (reg->type == type && reg->id == dst_reg->id)
-			reg->range = max_t(u16, reg->range, new_range);
+			reg->range = max(reg->range, new_range);
 	}
 }
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 1b93941bdfea..3b38a3d2eebd 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -7249,78 +7249,6 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
-	{
-		"bpf_exit with invalid return code. test1",
-		.insns = {
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 has value (0x0; 0xffffffff)",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test2",
-		.insns = {
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.result = ACCEPT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test3",
-		.insns = {
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 has value (0x0; 0x3)",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test4",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 1),
-			BPF_EXIT_INSN(),
-		},
-		.result = ACCEPT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test5",
-		.insns = {
-			BPF_MOV64_IMM(BPF_REG_0, 2),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 has value (0x2; 0x0)",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test6",
-		.insns = {
-			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 is not a known value (ctx)",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
-	{
-		"bpf_exit with invalid return code. test7",
-		.insns = {
-			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
-			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
-			BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
-			BPF_EXIT_INSN(),
-		},
-		.errstr = "R0 has unknown scalar value",
-		.result = REJECT,
-		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
-	},
 	{
 		"XDP pkt read, pkt_end >= pkt_data', bad access 1",
 		.insns = {
@@ -7470,6 +7398,78 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_XDP,
 		.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
 	},
+	{
+		"bpf_exit with invalid return code. test1",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0xffffffff)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test2",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test3",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x0; 0x3)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test4",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 1),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test5",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_0, 2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has value (0x2; 0x0)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test6",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 is not a known value (ctx)",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
+	{
+		"bpf_exit with invalid return code. test7",
+		.insns = {
+			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+			BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
+			BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R0 has unknown scalar value",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
cgit v1.3-14-g43fede


From 5beca081be9195b4316ac5f32921df0234ee8e0e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 1 Nov 2017 23:58:10 +0100
Subject: bpf: also improve pattern matches for meta access

Follow-up to 0fd4759c5515 ("bpf: fix pattern matches for direct
packet access") to cover also the remaining data_meta/data matches
in the verifier. The matches are also refactored a bit to simplify
handling of all the cases.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 165 +++++++++++++++++++++++++++++---------------------
 1 file changed, 96 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2cc3e9486a1f..530b68550fd2 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2787,6 +2787,99 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
 	}
 }
 
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+				   struct bpf_reg_state *dst_reg,
+				   struct bpf_reg_state *src_reg,
+				   struct bpf_verifier_state *this_branch,
+				   struct bpf_verifier_state *other_branch)
+{
+	if (BPF_SRC(insn->code) != BPF_X)
+		return false;
+
+	switch (BPF_OP(insn->code)) {
+	case BPF_JGT:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+			find_good_pkt_pointers(this_branch, dst_reg,
+					       dst_reg->type, false);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end > pkt_data', pkt_data > pkt_meta' */
+			find_good_pkt_pointers(other_branch, src_reg,
+					       src_reg->type, true);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JLT:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+			find_good_pkt_pointers(other_branch, dst_reg,
+					       dst_reg->type, true);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end < pkt_data', pkt_data > pkt_meta' */
+			find_good_pkt_pointers(this_branch, src_reg,
+					       src_reg->type, false);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JGE:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+			find_good_pkt_pointers(this_branch, dst_reg,
+					       dst_reg->type, true);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+			find_good_pkt_pointers(other_branch, src_reg,
+					       src_reg->type, false);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JLE:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+			find_good_pkt_pointers(other_branch, dst_reg,
+					       dst_reg->type, false);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
+			find_good_pkt_pointers(this_branch, src_reg,
+					       src_reg->type, true);
+		} else {
+			return false;
+		}
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
@@ -2893,75 +2986,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		 */
 		mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
 		mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' > pkt_end */
-		find_good_pkt_pointers(this_branch, dst_reg,
-				       PTR_TO_PACKET, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end > pkt_data' */
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET, true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' < pkt_end */
-		find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET,
-				       true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end < pkt_data' */
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' >= pkt_end */
-		find_good_pkt_pointers(this_branch, dst_reg,
-				       PTR_TO_PACKET, true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end >= pkt_data' */
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' <= pkt_end */
-		find_good_pkt_pointers(other_branch, dst_reg,
-				       PTR_TO_PACKET, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end <= pkt_data' */
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET, true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
-		   dst_reg->type == PTR_TO_PACKET_META &&
-		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
-		find_good_pkt_pointers(this_branch, dst_reg,
-				       PTR_TO_PACKET_META, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
-		   dst_reg->type == PTR_TO_PACKET_META &&
-		   reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
-		find_good_pkt_pointers(other_branch, dst_reg,
-				       PTR_TO_PACKET_META, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
-		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET_META, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
-		   reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_META) {
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
-				       PTR_TO_PACKET_META, false);
-	} else if (is_pointer_value(env, insn->dst_reg)) {
+	} else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
+					   this_branch, other_branch) &&
+		   is_pointer_value(env, insn->dst_reg)) {
 		verbose(env, "R%d pointer comparison prohibited\n",
 			insn->dst_reg);
 		return -EACCES;
-- 
cgit v1.3-14-g43fede


From 6082a6e44434a17f194048b7d48df56f148ec6d4 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Wed, 1 Nov 2017 11:04:51 -0700
Subject: kernel/time/Kconfig: Fix typo in comment

Fix typo in Kconfig comment text.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Jiri Kosina <trivial@kernel.org>
Link: https://lkml.kernel.org/r/0e586dd4-2b27-864e-c252-bc72df52fd01@infradead.org
---
 kernel/time/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index ac09bc29eb08..d689a9557e17 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -56,7 +56,7 @@ menu "Timers subsystem"
 
 # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
 # only related to the tick functionality. Oneshot clockevent devices
-# are supported independ of this.
+# are supported independent of this.
 config TICK_ONESHOT
 	bool
 
-- 
cgit v1.3-14-g43fede


From fd30b717b86dc30ffe25596f8de6542a02ae9401 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sun, 22 Oct 2017 17:58:54 -0700
Subject: rcu: Convert timers to use timer_setup()

In preparation for unconditionally passing the struct timer_list pointer to
all timer callbacks, switch to using the new timer_setup() and from_timer()
to pass the timer pointer explicitly.

Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 kernel/rcu/rcutorture.c  | 4 ++--
 kernel/rcu/tree_plugin.h | 9 +++++----
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 45f2ffbc1e78..96a3cdaeed91 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1076,7 +1076,7 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp)
  * counter in the element should never be greater than 1, otherwise, the
  * RCU implementation is broken.
  */
-static void rcu_torture_timer(unsigned long unused)
+static void rcu_torture_timer(struct timer_list *unused)
 {
 	int idx;
 	unsigned long started;
@@ -1163,7 +1163,7 @@ rcu_torture_reader(void *arg)
 	VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
 	set_user_nice(current, MAX_NICE);
 	if (irqreader && cur_ops->irq_capable)
-		setup_timer_on_stack(&t, rcu_torture_timer, 0);
+		timer_setup_on_stack(&t, rcu_torture_timer, 0);
 
 	do {
 		if (irqreader && cur_ops->irq_capable) {
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..e85946d9843b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2261,9 +2261,11 @@ static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
 }
 
 /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(unsigned long x)
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
 {
-	do_nocb_deferred_wakeup_common((struct rcu_data *)x);
+	struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+	do_nocb_deferred_wakeup_common(rdp);
 }
 
 /*
@@ -2327,8 +2329,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
 	init_swait_queue_head(&rdp->nocb_wq);
 	rdp->nocb_follower_tail = &rdp->nocb_follower_head;
 	raw_spin_lock_init(&rdp->nocb_lock);
-	setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
-		    (unsigned long)rdp);
+	timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From 7cce782ef32f5003c18ab8f9f586f2ed5ce2c33e Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 2 Nov 2017 12:05:51 +0100
Subject: bpf: fix link error without CONFIG_NET

I ran into this link error with the latest net-next plus linux-next
trees when networking is disabled:

kernel/bpf/verifier.o:(.rodata+0x2958): undefined reference to `tc_cls_act_analyzer_ops'
kernel/bpf/verifier.o:(.rodata+0x2970): undefined reference to `xdp_analyzer_ops'

It seems that the code was written to deal with varying contents of
the arrray, but the actual #ifdef was missing. Both tc_cls_act_analyzer_ops
and xdp_analyzer_ops are defined in the core networking code, so adding
a check for CONFIG_NET seems appropriate here, and I've verified this with
many randconfig builds

Fixes: 4f9218aaf8a4 ("bpf: move knowledge about post-translation offsets out of verifier")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 530b68550fd2..5f3799dcba01 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4588,8 +4588,10 @@ err_free_env:
 }
 
 static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = {
+#ifdef CONFIG_NET
 	[BPF_PROG_TYPE_XDP]		= &xdp_analyzer_ops,
 	[BPF_PROG_TYPE_SCHED_CLS]	= &tc_cls_act_analyzer_ops,
+#endif
 };
 
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-- 
cgit v1.3-14-g43fede


From eba0c929d1d0f16c4b03628b7bf8ce363b9e5c9a Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 2 Nov 2017 12:05:52 +0100
Subject: bpf: fix out-of-bounds access warning in bpf_check

The bpf_verifer_ops array is generated dynamically and may be
empty depending on configuration, which then causes an out
of bounds access:

kernel/bpf/verifier.c: In function 'bpf_check':
kernel/bpf/verifier.c:4320:29: error: array subscript is above array bounds [-Werror=array-bounds]

This adds a check to the start of the function as a workaround.
I would assume that the function is never called in that configuration,
so the warning is probably harmless.

Fixes: 00176a34d9e2 ("bpf: remove the verifier ops from program structure")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5f3799dcba01..ab5aa5497666 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4474,6 +4474,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	struct bpf_verifer_log *log;
 	int ret = -EINVAL;
 
+	/* no program is valid */
+	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+		return -EINVAL;
+
 	/* 'struct bpf_verifier_env' can be global, but since it's not small,
 	 * allocate/free it every time bpf_check() is called
 	 */
-- 
cgit v1.3-14-g43fede


From 8c01c4f896aa3404af948880dcb29a2d51c833dc Mon Sep 17 00:00:00 2001
From: Craig Gallek <kraig@google.com>
Date: Thu, 2 Nov 2017 11:18:01 -0400
Subject: bpf: fix verifier NULL pointer dereference

do_check() can fail early without allocating env->cur_state under
memory pressure.  Syzkaller found the stack below on the linux-next
tree because of this.

  kasan: CONFIG_KASAN_INLINE enabled
  kasan: GPF could be caused by NULL-ptr deref or user memory access
  general protection fault: 0000 [#1] SMP KASAN
  Dumping ftrace buffer:
     (ftrace buffer empty)
  Modules linked in:
  CPU: 1 PID: 27062 Comm: syz-executor5 Not tainted 4.14.0-rc7+ #106
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  task: ffff8801c2c74700 task.stack: ffff8801c3e28000
  RIP: 0010:free_verifier_state kernel/bpf/verifier.c:347 [inline]
  RIP: 0010:bpf_check+0xcf4/0x19c0 kernel/bpf/verifier.c:4533
  RSP: 0018:ffff8801c3e2f5c8 EFLAGS: 00010202
  RAX: dffffc0000000000 RBX: 00000000fffffff4 RCX: 0000000000000000
  RDX: 0000000000000070 RSI: ffffffff817d5aa9 RDI: 0000000000000380
  RBP: ffff8801c3e2f668 R08: 0000000000000000 R09: 1ffff100387c5d9f
  R10: 00000000218c4e80 R11: ffffffff85b34380 R12: ffff8801c4dc6a28
  R13: 0000000000000000 R14: ffff8801c4dc6a00 R15: ffff8801c4dc6a20
  FS:  00007f311079b700(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00000000004d4a24 CR3: 00000001cbcd0000 CR4: 00000000001406e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   bpf_prog_load+0xcbb/0x18e0 kernel/bpf/syscall.c:1166
   SYSC_bpf kernel/bpf/syscall.c:1690 [inline]
   SyS_bpf+0xae9/0x4620 kernel/bpf/syscall.c:1652
   entry_SYSCALL_64_fastpath+0x1f/0xbe
  RIP: 0033:0x452869
  RSP: 002b:00007f311079abe8 EFLAGS: 00000212 ORIG_RAX: 0000000000000141
  RAX: ffffffffffffffda RBX: 0000000000758020 RCX: 0000000000452869
  RDX: 0000000000000030 RSI: 0000000020168000 RDI: 0000000000000005
  RBP: 00007f311079aa20 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000212 R12: 00000000004b7550
  R13: 00007f311079ab58 R14: 00000000004b7560 R15: 0000000000000000
  Code: df 48 c1 ea 03 80 3c 02 00 0f 85 e6 0b 00 00 4d 8b 6e 20 48 b8 00 00 00 00 00 fc ff df 49 8d bd 80 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 b6 0b 00 00 49 8b bd 80 03 00 00 e8 d6 0c 26
  RIP: free_verifier_state kernel/bpf/verifier.c:347 [inline] RSP: ffff8801c3e2f5c8
  RIP: bpf_check+0xcf4/0x19c0 kernel/bpf/verifier.c:4533 RSP: ffff8801c3e2f5c8
  ---[ end trace c8d37f339dc64004 ]---

Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption")
Fixes: 1969db47f8d0 ("bpf: fix verifier memory leaks")
Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ab5aa5497666..04357ad5a812 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4534,8 +4534,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
-	free_verifier_state(env->cur_state, true);
-	env->cur_state = NULL;
+	if (env->cur_state) {
+		free_verifier_state(env->cur_state, true);
+		env->cur_state = NULL;
+	}
 
 skip_full_check:
 	while (!pop_stack(env, NULL, NULL));
@@ -4643,8 +4645,10 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
-	free_verifier_state(env->cur_state, true);
-	env->cur_state = NULL;
+	if (env->cur_state) {
+		free_verifier_state(env->cur_state, true);
+		env->cur_state = NULL;
+	}
 
 skip_full_check:
 	while (!pop_stack(env, NULL, NULL));
-- 
cgit v1.3-14-g43fede


From edbfd9112f70c34b2965580a67dad5fb306fb6c9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 3 Nov 2017 07:02:15 -0700
Subject: Revert "workqueue: respect isolated cpus when queueing an unbound
 work"

This reverts commit b5149873a0c299195b5346fe4dc2c5b04ae2f995.

It conflicts with the following isolcpus change from the sched branch.

 edb9382175c3 ("sched/isolation: Move isolcpus= handling to the housekeeping code")

Let's revert for now.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bfa433b38a61..64d0edf428f8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4980,10 +4980,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
 	if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
 		return -ENOMEM;
 
-	/*
-	 * Not excluding isolated cpus on purpose.
-	 * If the user wishes to include them, we allow that.
-	 */
 	cpumask_and(cpumask, cpumask, cpu_possible_mask);
 	if (!cpumask_empty(cpumask)) {
 		apply_wqattrs_lock();
@@ -5583,7 +5579,7 @@ int __init workqueue_init_early(void)
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_andnot(wq_unbound_cpumask, cpu_possible_mask, cpu_isolated_map);
+	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
-- 
cgit v1.3-14-g43fede


From 2d2123bc7c7f843aa9db87720de159a049839862 Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin@arm.com>
Date: Tue, 31 Oct 2017 15:51:14 +0000
Subject: arm64/sve: Add prctl controls for userspace vector length management
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds two arm64-specific prctls, to permit userspace to
control its vector length:

 * PR_SVE_SET_VL: set the thread's SVE vector length and vector
   length inheritance mode.

 * PR_SVE_GET_VL: get the same information.

Although these prctls resemble instruction set features in the SVE
architecture, they provide additional control: the vector length
inheritance mode is Linux-specific and nothing to do with the
architecture, and the architecture does not permit EL0 to set its
own vector length directly.  Both can be used in portable tools
without requiring the use of SVE instructions.

Signed-off-by: Dave Martin <Dave.Martin@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Alex Bennée <alex.bennee@linaro.org>
[will: Fixed up prctl constants to avoid clash with PDEATHSIG]
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/fpsimd.h    | 14 +++++++++++
 arch/arm64/include/asm/processor.h |  4 +++
 arch/arm64/kernel/fpsimd.c         | 50 ++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/prctl.h         |  4 +++
 kernel/sys.c                       | 12 +++++++++
 5 files changed, 84 insertions(+)

(limited to 'kernel')

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index d754e5a6949c..b868412c815c 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -17,6 +17,7 @@
 #define __ASM_FP_H
 
 #include <asm/ptrace.h>
+#include <asm/errno.h>
 
 #ifndef __ASSEMBLY__
 
@@ -98,6 +99,9 @@ extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);
 extern int sve_set_vector_length(struct task_struct *task,
 				 unsigned long vl, unsigned long flags);
 
+extern int sve_set_current_vl(unsigned long arg);
+extern int sve_get_current_vl(void);
+
 /*
  * Probing and setup functions.
  * Calls to these functions must be serialised with one another.
@@ -114,6 +118,16 @@ static inline void fpsimd_release_task(struct task_struct *task) { }
 static inline void sve_sync_to_fpsimd(struct task_struct *task) { }
 static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { }
 
+static inline int sve_set_current_vl(unsigned long arg)
+{
+	return -EINVAL;
+}
+
+static inline int sve_get_current_vl(void)
+{
+	return -EINVAL;
+}
+
 static inline void sve_init_vq_map(void) { }
 static inline void sve_update_vq_map(void) { }
 static inline int sve_verify_vq_map(void) { return 0; }
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index c6fddb005dc2..023cacb946c3 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -217,5 +217,9 @@ static inline void spin_lock_prefetch(const void *ptr)
 int cpu_enable_pan(void *__unused);
 int cpu_enable_cache_maint_trap(void *__unused);
 
+/* Userspace interface for PR_SVE_{SET,GET}_VL prctl()s: */
+#define SVE_SET_VL(arg)	sve_set_current_vl(arg)
+#define SVE_GET_VL()	sve_get_current_vl()
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASM_PROCESSOR_H */
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index b82d44693b9d..fd3cfdd7f9be 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -29,6 +29,7 @@
 #include <linux/irqflags.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
+#include <linux/prctl.h>
 #include <linux/preempt.h>
 #include <linux/prctl.h>
 #include <linux/ptrace.h>
@@ -558,6 +559,55 @@ out:
 	return 0;
 }
 
+/*
+ * Encode the current vector length and flags for return.
+ * This is only required for prctl(): ptrace has separate fields
+ *
+ * flags are as for sve_set_vector_length().
+ */
+static int sve_prctl_status(unsigned long flags)
+{
+	int ret;
+
+	if (flags & PR_SVE_SET_VL_ONEXEC)
+		ret = current->thread.sve_vl_onexec;
+	else
+		ret = current->thread.sve_vl;
+
+	if (test_thread_flag(TIF_SVE_VL_INHERIT))
+		ret |= PR_SVE_VL_INHERIT;
+
+	return ret;
+}
+
+/* PR_SVE_SET_VL */
+int sve_set_current_vl(unsigned long arg)
+{
+	unsigned long vl, flags;
+	int ret;
+
+	vl = arg & PR_SVE_VL_LEN_MASK;
+	flags = arg & ~vl;
+
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	ret = sve_set_vector_length(current, vl, flags);
+	if (ret)
+		return ret;
+
+	return sve_prctl_status(flags);
+}
+
+/* PR_SVE_GET_VL */
+int sve_get_current_vl(void)
+{
+	if (!system_supports_sve())
+		return -EINVAL;
+
+	return sve_prctl_status(0);
+}
+
 /*
  * Bitmap for temporary storage of the per-CPU set of supported vector lengths
  * during secondary boot.
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 1b64901ca6b3..f60db5db6e8e 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -198,7 +198,11 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
 /* arm64 Scalable Vector Extension controls */
+/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */
+#define PR_SVE_SET_VL			50	/* set task vector length */
 # define PR_SVE_SET_VL_ONEXEC		(1 << 18) /* defer effect until exec */
+#define PR_SVE_GET_VL			51	/* get task vector length */
+/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */
 # define PR_SVE_VL_LEN_MASK		0xffff
 # define PR_SVE_VL_INHERIT		(1 << 17) /* inherit across exec */
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 9aebc2935013..c541916b38c6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -110,6 +110,12 @@
 #ifndef SET_FP_MODE
 # define SET_FP_MODE(a,b)	(-EINVAL)
 #endif
+#ifndef SVE_SET_VL
+# define SVE_SET_VL(a)		(-EINVAL)
+#endif
+#ifndef SVE_GET_VL
+# define SVE_GET_VL()		(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2385,6 +2391,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_SVE_SET_VL:
+		error = SVE_SET_VL(arg2);
+		break;
+	case PR_SVE_GET_VL:
+		error = SVE_GET_VL();
+		break;
 	default:
 		error = -EINVAL;
 		break;
-- 
cgit v1.3-14-g43fede


From d62d813c0d714a2d0aaf3d796a7a51ae60bf5470 Mon Sep 17 00:00:00 2001
From: Chris Redpath <chris.redpath@arm.com>
Date: Fri, 3 Nov 2017 13:36:42 +0000
Subject: cpufreq: schedutil: Examine the correct CPU when we update util

After commit 674e75411fc2 (sched: cpufreq: Allow remote cpufreq
callbacks) we stopped to always read the utilization for the CPU we
are running the governor on, and instead we read it for the CPU
which we've been told has updated utilization.  This is stored in
sugov_cpu->cpu.

The value is set in sugov_register() but we clear it in sugov_start()
which leads to always looking at the utilization of CPU0 instead of
the correct one.

Fix this by consolidating the initialization code into sugov_start().

Fixes: 674e75411fc2 (sched: cpufreq: Allow remote cpufreq callbacks)
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
Reviewed-by: Patrick Bellasi <patrick.bellasi@arm.com>
Reviewed-by: Brendan Jackman <brendan.jackman@arm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 9209d83ecdcf..ba0da243fdd8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -649,6 +649,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
 
 		memset(sg_cpu, 0, sizeof(*sg_cpu));
+		sg_cpu->cpu = cpu;
 		sg_cpu->sg_policy = sg_policy;
 		sg_cpu->flags = SCHED_CPUFREQ_RT;
 		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
@@ -714,11 +715,6 @@ struct cpufreq_governor *cpufreq_default_governor(void)
 
 static int __init sugov_register(void)
 {
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		per_cpu(sugov_cpu, cpu).cpu = cpu;
-
 	return cpufreq_register_governor(&schedutil_gov);
 }
 fs_initcall(sugov_register);
-- 
cgit v1.3-14-g43fede


From ab3f0063c48c26c927851b6767824e35a716d878 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:17 -0700
Subject: bpf: offload: add infrastructure for loading programs for a specific
 netdev

The fact that we don't know which device the program is going
to be used on is quite limiting in current eBPF infrastructure.
We have to reverse or limit the changes which kernel makes to
the loaded bytecode if we want it to be offloaded to a networking
device.  We also have to invent new APIs for debugging and
troubleshooting support.

Make it possible to load programs for a specific netdev.  This
helps us to bring the debug information closer to the core
eBPF infrastructure (e.g. we will be able to reuse the verifer
log in device JIT).  It allows device JITs to perform translation
on the original bytecode.

__bpf_prog_get() when called to get a reference for an attachment
point will now refuse to give it if program has a device assigned.
Following patches will add a version of that function which passes
the expected netdev in. @type argument in __bpf_prog_get() is
renamed to attach_type to make it clearer that it's only set on
attachment.

All calls to ndo_bpf are protected by rtnl, only verifier callbacks
are not.  We need a wait queue to make sure netdev doesn't get
destroyed while verifier is still running and calling its driver.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h          |  36 +++++++++
 include/linux/bpf_verifier.h |  10 +++
 include/linux/netdevice.h    |  14 ++++
 include/uapi/linux/bpf.h     |   1 +
 kernel/bpf/Makefile          |   1 +
 kernel/bpf/core.c            |  10 ++-
 kernel/bpf/offload.c         | 182 +++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c         |  17 +++-
 kernel/bpf/verifier.c        |  15 +++-
 9 files changed, 278 insertions(+), 8 deletions(-)
 create mode 100644 kernel/bpf/offload.c

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 520aeebe0d93..e45d43f9ec92 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/rbtree_latch.h>
 #include <linux/numa.h>
+#include <linux/wait.h>
 
 struct perf_event;
 struct bpf_prog;
@@ -182,6 +183,16 @@ struct bpf_verifier_ops {
 				  struct bpf_prog *prog, u32 *target_size);
 };
 
+struct bpf_dev_offload {
+	struct bpf_prog		*prog;
+	struct net_device	*netdev;
+	void			*dev_priv;
+	struct list_head	offloads;
+	bool			dev_state;
+	bool			verifier_running;
+	wait_queue_head_t	verifier_done;
+};
+
 struct bpf_prog_aux {
 	atomic_t refcnt;
 	u32 used_map_cnt;
@@ -199,6 +210,7 @@ struct bpf_prog_aux {
 #ifdef CONFIG_SECURITY
 	void *security;
 #endif
+	struct bpf_dev_offload *offload;
 	union {
 		struct work_struct work;
 		struct rcu_head	rcu;
@@ -317,6 +329,7 @@ extern const struct file_operations bpf_prog_fops;
 #undef BPF_PROG_TYPE
 #undef BPF_MAP_TYPE
 
+extern const struct bpf_prog_ops bpf_offload_prog_ops;
 extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
 extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
@@ -491,6 +504,29 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+int bpf_prog_offload_compile(struct bpf_prog *prog);
+void bpf_prog_offload_destroy(struct bpf_prog *prog);
+
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+	return aux->offload;
+}
+#else
+static inline int bpf_prog_offload_init(struct bpf_prog *prog,
+					union bpf_attr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
+{
+	return false;
+}
+#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
+
 #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL)
 struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
 int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 3b0976aaac75..e45011dbc02d 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -153,6 +153,7 @@ struct bpf_verifier_env {
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
 	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
+	const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
 	void *analyzer_priv; /* pointer to external analyzer's private data */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
@@ -169,6 +170,15 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env)
 	return env->cur_state->regs;
 }
 
+#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env);
+#else
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
 int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
 		 void *priv);
 
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9af9feaaeb64..fda527ccb263 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -797,8 +797,13 @@ enum bpf_netdev_command {
 	 * is equivalent to XDP_ATTACHED_DRV.
 	 */
 	XDP_QUERY_PROG,
+	/* BPF program for offload callbacks, invoked at program load time. */
+	BPF_OFFLOAD_VERIFIER_PREP,
+	BPF_OFFLOAD_TRANSLATE,
+	BPF_OFFLOAD_DESTROY,
 };
 
+struct bpf_ext_analyzer_ops;
 struct netlink_ext_ack;
 
 struct netdev_bpf {
@@ -815,6 +820,15 @@ struct netdev_bpf {
 			u8 prog_attached;
 			u32 prog_id;
 		};
+		/* BPF_OFFLOAD_VERIFIER_PREP */
+		struct {
+			struct bpf_prog *prog;
+			const struct bpf_ext_analyzer_ops *ops; /* callee set */
+		} verifier;
+		/* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */
+		struct {
+			struct bpf_prog *prog;
+		} offload;
 	};
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index a9820677c2ff..80d191a93fb0 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -260,6 +260,7 @@ union bpf_attr {
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
+		__u32		prog_target_ifindex;	/* ifindex of netdev to prep for */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 16e95c8e749e..e691da0b3bab 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
 obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7fe448799d76..8a6c37762330 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1380,7 +1380,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	 * valid program, which in this case would simply not
 	 * be JITed, but falls back to the interpreter.
 	 */
-	fp = bpf_int_jit_compile(fp);
+	if (!bpf_prog_is_dev_bound(fp->aux)) {
+		fp = bpf_int_jit_compile(fp);
+	} else {
+		*err = bpf_prog_offload_compile(fp);
+		if (*err)
+			return fp;
+	}
 	bpf_prog_lock_ro(fp);
 
 	/* The tail call compatibility check can only be done at
@@ -1549,6 +1555,8 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	struct bpf_prog_aux *aux;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
+	if (bpf_prog_is_dev_bound(aux))
+		bpf_prog_offload_destroy(aux->prog);
 	bpf_jit_free(aux->prog);
 }
 
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..5553e0e2f8b1
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,182 @@
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_dev_offload *offload;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->prog_flags)
+		return -EINVAL;
+
+	offload = kzalloc(sizeof(*offload), GFP_USER);
+	if (!offload)
+		return -ENOMEM;
+
+	offload->prog = prog;
+	init_waitqueue_head(&offload->verifier_done);
+
+	rtnl_lock();
+	offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex);
+	if (!offload->netdev) {
+		rtnl_unlock();
+		kfree(offload);
+		return -EINVAL;
+	}
+
+	prog->aux->offload = offload;
+	list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+	rtnl_unlock();
+
+	return 0;
+}
+
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+			     struct netdev_bpf *data)
+{
+	struct net_device *netdev = prog->aux->offload->netdev;
+
+	ASSERT_RTNL();
+
+	if (!netdev)
+		return -ENODEV;
+	if (!netdev->netdev_ops->ndo_bpf)
+		return -EOPNOTSUPP;
+
+	data->command = cmd;
+
+	return netdev->netdev_ops->ndo_bpf(netdev, data);
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+	struct netdev_bpf data = {};
+	int err;
+
+	data.verifier.prog = env->prog;
+
+	rtnl_lock();
+	err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
+	if (err)
+		goto exit_unlock;
+
+	env->dev_ops = data.verifier.ops;
+
+	env->prog->aux->offload->dev_state = true;
+	env->prog->aux->offload->verifier_running = true;
+exit_unlock:
+	rtnl_unlock();
+	return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct netdev_bpf data = {};
+
+	data.offload.prog = prog;
+
+	if (offload->verifier_running)
+		wait_event(offload->verifier_done, !offload->verifier_running);
+
+	if (offload->dev_state)
+		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+
+	offload->dev_state = false;
+	list_del_init(&offload->offloads);
+	offload->netdev = NULL;
+}
+
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+
+	offload->verifier_running = false;
+	wake_up(&offload->verifier_done);
+
+	rtnl_lock();
+	__bpf_prog_offload_destroy(prog);
+	rtnl_unlock();
+
+	kfree(offload);
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct netdev_bpf data = {};
+	int ret;
+
+	data.offload.prog = prog;
+
+	offload->verifier_running = false;
+	wake_up(&offload->verifier_done);
+
+	rtnl_lock();
+	ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+	rtnl_unlock();
+
+	return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+					  const struct bpf_insn *insn)
+{
+	WARN(1, "attempt to execute device eBPF program on the host!");
+	return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+	prog->bpf_func = bpf_prog_warn_on_exec;
+
+	return bpf_prog_offload_translate(prog);
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+static int bpf_offload_notification(struct notifier_block *notifier,
+				    ulong event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct bpf_dev_offload *offload, *tmp;
+
+	ASSERT_RTNL();
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
+					 offloads) {
+			if (offload->netdev == netdev)
+				__bpf_prog_offload_destroy(offload->prog);
+		}
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block bpf_offload_notifier = {
+	.notifier_call = bpf_offload_notification,
+};
+
+static int __init bpf_offload_init(void)
+{
+	register_netdevice_notifier(&bpf_offload_notifier);
+	return 0;
+}
+
+subsys_initcall(bpf_offload_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 323be2473c4b..1574b9f0f24e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -824,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
 		return -EINVAL;
 
-	prog->aux->ops = bpf_prog_types[type];
+	if (!bpf_prog_is_dev_bound(prog->aux))
+		prog->aux->ops = bpf_prog_types[type];
+	else
+		prog->aux->ops = &bpf_offload_prog_ops;
 	prog->type = type;
 	return 0;
 }
@@ -1054,7 +1057,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
@@ -1062,7 +1065,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
-	if (type && prog->type != *type) {
+	if (attach_type && (prog->type != *attach_type || prog->aux->offload)) {
 		prog = ERR_PTR(-EINVAL);
 		goto out;
 	}
@@ -1089,7 +1092,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_name
+#define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1152,6 +1155,12 @@ static int bpf_prog_load(union bpf_attr *attr)
 	atomic_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
+	if (attr->prog_target_ifindex) {
+		err = bpf_prog_offload_init(prog, attr);
+		if (err)
+			goto free_prog;
+	}
+
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 04357ad5a812..51aabb32ad67 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3736,10 +3736,13 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
 				  int insn_idx, int prev_insn_idx)
 {
-	if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
-		return 0;
+	if (env->analyzer_ops && env->analyzer_ops->insn_hook)
+		return env->analyzer_ops->insn_hook(env, insn_idx,
+						    prev_insn_idx);
+	if (env->dev_ops && env->dev_ops->insn_hook)
+		return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
 
-	return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+	return 0;
 }
 
 static int do_check(struct bpf_verifier_env *env)
@@ -4516,6 +4519,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
 
+	if (env->prog->aux->offload) {
+		ret = bpf_prog_offload_verifier_prep(env);
+		if (ret)
+			goto err_unlock;
+	}
+
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;
-- 
cgit v1.3-14-g43fede


From bd601b6ada11fdfb9e277f24ad2eb54bc599156b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:18 -0700
Subject: bpf: report offload info to user space

Extend struct bpf_prog_info to contain information about program
being bound to a device.  Since the netdev may get destroyed while
program still exists we need a flag to indicate the program is
loaded for a device, even if the device is gone.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |  1 +
 include/uapi/linux/bpf.h |  6 ++++++
 kernel/bpf/offload.c     | 12 ++++++++++++
 kernel/bpf/syscall.c     |  5 +++++
 4 files changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e45d43f9ec92..98bacd0fa5cc 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -506,6 +506,7 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
+u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 80d191a93fb0..4455dd195201 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -895,6 +895,10 @@ enum sk_action {
 
 #define BPF_TAG_SIZE	8
 
+enum bpf_prog_status {
+	BPF_PROG_STATUS_DEV_BOUND	= (1 << 0),
+};
+
 struct bpf_prog_info {
 	__u32 type;
 	__u32 id;
@@ -908,6 +912,8 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
+	__u32 ifindex;
+	__u32 status;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 5553e0e2f8b1..2816feb38be1 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -144,6 +144,18 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
+u32 bpf_prog_offload_ifindex(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	u32 ifindex;
+
+	rtnl_lock();
+	ifindex = offload->netdev ? offload->netdev->ifindex : 0;
+	rtnl_unlock();
+
+	return ifindex;
+}
+
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1574b9f0f24e..3217c20ea91b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1592,6 +1592,11 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
+	if (bpf_prog_is_dev_bound(prog->aux)) {
+		info.status |= BPF_PROG_STATUS_DEV_BOUND;
+		info.ifindex = bpf_prog_offload_ifindex(prog);
+	}
+
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.3-14-g43fede


From 248f346ffe9508dee0039db4ac839cb31ba3bdec Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:20 -0700
Subject: xdp: allow attaching programs loaded for specific device

Pass the netdev pointer to bpf_prog_get_type().  This way
BPF code can decide whether the device matches what the
code was loaded/translated for.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h  | 10 ++++++++++
 kernel/bpf/syscall.c | 33 +++++++++++++++++++++++++++++----
 net/core/dev.c       |  6 +++++-
 3 files changed, 44 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 98bacd0fa5cc..c397934f91dd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -335,6 +335,8 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+				       struct net_device *netdev);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
 struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
@@ -428,6 +430,14 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
+
+static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
+						     enum bpf_prog_type type,
+						     struct net_device *netdev)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
 static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog,
 							  int i)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 3217c20ea91b..68f9123acd39 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,7 +1057,22 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type)
+static bool bpf_prog_can_attach(struct bpf_prog *prog,
+				enum bpf_prog_type *attach_type,
+				struct net_device *netdev)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+
+	if (prog->type != *attach_type)
+		return false;
+	if (offload && offload->netdev != netdev)
+		return false;
+
+	return true;
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
+				       struct net_device *netdev)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
@@ -1065,7 +1080,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type)
 	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
-	if (attach_type && (prog->type != *attach_type || prog->aux->offload)) {
+	if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) {
 		prog = ERR_PTR(-EINVAL);
 		goto out;
 	}
@@ -1078,12 +1093,12 @@ out:
 
 struct bpf_prog *bpf_prog_get(u32 ufd)
 {
-	return __bpf_prog_get(ufd, NULL);
+	return __bpf_prog_get(ufd, NULL, NULL);
 }
 
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 {
-	struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL);
 
 	if (!IS_ERR(prog))
 		trace_bpf_prog_get_type(prog);
@@ -1091,6 +1106,16 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+				       struct net_device *netdev)
+{
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev);
+
+	if (!IS_ERR(prog))
+		trace_bpf_prog_get_type(prog);
+	return prog;
+}
+
 /* last field in 'union bpf_attr' used by this command */
 #define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 10cde58d3275..30b5fe32c525 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7157,7 +7157,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		    __dev_xdp_attached(dev, bpf_op, NULL))
 			return -EBUSY;
 
-		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+		if (bpf_op == ops->ndo_bpf)
+			prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+						     dev);
+		else
+			prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
-- 
cgit v1.3-14-g43fede


From 6c8dfe21c435cf2953e3cee43e12180cbc4f0820 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:21 -0700
Subject: cls_bpf: allow attaching programs loaded for specific device

If TC program is loaded with skip_sw flag, we should allow
the device-specific programs to be accepted.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Simon Horman <simon.horman@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c |  1 +
 net/sched/cls_bpf.c  | 10 +++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 68f9123acd39..416d70cdfc76 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1115,6 +1115,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 		trace_bpf_prog_get_type(prog);
 	return prog;
 }
+EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
 
 /* last field in 'union bpf_attr' used by this command */
 #define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index bc3edde1b9d7..dc9bd9a0070b 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -374,7 +374,7 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
 }
 
 static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
-				 const struct tcf_proto *tp)
+				 u32 gen_flags, const struct tcf_proto *tp)
 {
 	struct bpf_prog *fp;
 	char *name = NULL;
@@ -382,7 +382,11 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 
 	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
 
-	fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
+	if (gen_flags & TCA_CLS_FLAGS_SKIP_SW)
+		fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS,
+					   qdisc_dev(tp->q));
+	else
+		fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
@@ -440,7 +444,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp,
 	prog->gen_flags = gen_flags;
 
 	ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) :
-		       cls_bpf_prog_from_efd(tb, prog, tp);
+		       cls_bpf_prog_from_efd(tb, prog, gen_flags, tp);
 	if (ret < 0)
 		return ret;
 
-- 
cgit v1.3-14-g43fede


From b37a530613104aa3f592376c67a462823298759c Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri, 3 Nov 2017 13:56:30 -0700
Subject: bpf: remove old offload/analyzer

Thanks to the ability to load a program for a specific device,
running verifier twice is no longer needed.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf_verifier.h |  5 ---
 kernel/bpf/verifier.c        | 75 --------------------------------------------
 net/core/filter.c            | 42 -------------------------
 3 files changed, 122 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index e45011dbc02d..07b96aaca256 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -152,9 +152,7 @@ struct bpf_verifier_env {
 	bool strict_alignment;		/* perform strict pointer alignment checks */
 	struct bpf_verifier_state *cur_state; /* current verifier state */
 	struct bpf_verifier_state_list **explored_states; /* search pruning optimization */
-	const struct bpf_ext_analyzer_ops *analyzer_ops; /* external analyzer ops */
 	const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */
-	void *analyzer_priv; /* pointer to external analyzer's private data */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
 	u32 id_gen;			/* used to generate unique reg IDs */
@@ -179,7 +177,4 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
 }
 #endif
 
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-		 void *priv);
-
 #endif /* _LINUX_BPF_VERIFIER_H */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 51aabb32ad67..add845fe788a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -949,9 +949,6 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		 */
 		*reg_type = info.reg_type;
 
-		if (env->analyzer_ops)
-			return 0;
-
 		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
@@ -3736,9 +3733,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
 				  int insn_idx, int prev_insn_idx)
 {
-	if (env->analyzer_ops && env->analyzer_ops->insn_hook)
-		return env->analyzer_ops->insn_hook(env, insn_idx,
-						    prev_insn_idx);
 	if (env->dev_ops && env->dev_ops->insn_hook)
 		return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
 
@@ -4601,72 +4595,3 @@ err_free_env:
 	kfree(env);
 	return ret;
 }
-
-static const struct bpf_verifier_ops * const bpf_analyzer_ops[] = {
-#ifdef CONFIG_NET
-	[BPF_PROG_TYPE_XDP]		= &xdp_analyzer_ops,
-	[BPF_PROG_TYPE_SCHED_CLS]	= &tc_cls_act_analyzer_ops,
-#endif
-};
-
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-		 void *priv)
-{
-	struct bpf_verifier_env *env;
-	int ret;
-
-	if (prog->type >= ARRAY_SIZE(bpf_analyzer_ops) ||
-	    !bpf_analyzer_ops[prog->type])
-		return -EOPNOTSUPP;
-
-	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
-	if (!env)
-		return -ENOMEM;
-
-	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
-				     prog->len);
-	ret = -ENOMEM;
-	if (!env->insn_aux_data)
-		goto err_free_env;
-	env->prog = prog;
-	env->ops = bpf_analyzer_ops[env->prog->type];
-	env->analyzer_ops = ops;
-	env->analyzer_priv = priv;
-
-	/* grab the mutex to protect few globals used by verifier */
-	mutex_lock(&bpf_verifier_lock);
-
-	env->strict_alignment = false;
-	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-		env->strict_alignment = true;
-
-	env->explored_states = kcalloc(env->prog->len,
-				       sizeof(struct bpf_verifier_state_list *),
-				       GFP_KERNEL);
-	ret = -ENOMEM;
-	if (!env->explored_states)
-		goto skip_full_check;
-
-	ret = check_cfg(env);
-	if (ret < 0)
-		goto skip_full_check;
-
-	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
-	ret = do_check(env);
-	if (env->cur_state) {
-		free_verifier_state(env->cur_state, true);
-		env->cur_state = NULL;
-	}
-
-skip_full_check:
-	while (!pop_stack(env, NULL, NULL));
-	free_states(env);
-
-	mutex_unlock(&bpf_verifier_lock);
-	vfree(env->insn_aux_data);
-err_free_env:
-	kfree(env);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(bpf_analyzer);
diff --git a/net/core/filter.c b/net/core/filter.c
index a0112168d6f9..1afa17935954 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3777,25 +3777,6 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return bpf_skb_is_valid_access(off, size, type, info);
 }
 
-static bool
-tc_cls_act_is_valid_access_analyzer(int off, int size,
-				    enum bpf_access_type type,
-				    struct bpf_insn_access_aux *info)
-{
-	switch (off) {
-	case offsetof(struct sk_buff, len):
-		return true;
-	case offsetof(struct sk_buff, data):
-		info->reg_type = PTR_TO_PACKET;
-		return true;
-	case offsetof(struct sk_buff, cb) +
-	     offsetof(struct bpf_skb_data_end, data_end):
-		info->reg_type = PTR_TO_PACKET_END;
-		return true;
-	}
-	return false;
-}
-
 static bool __is_valid_xdp_access(int off, int size)
 {
 	if (off < 0 || off >= sizeof(struct xdp_md))
@@ -3830,21 +3811,6 @@ static bool xdp_is_valid_access(int off, int size,
 	return __is_valid_xdp_access(off, size);
 }
 
-static bool xdp_is_valid_access_analyzer(int off, int size,
-					 enum bpf_access_type type,
-					 struct bpf_insn_access_aux *info)
-{
-	switch (off) {
-	case offsetof(struct xdp_buff, data):
-		info->reg_type = PTR_TO_PACKET;
-		return true;
-	case offsetof(struct xdp_buff, data_end):
-		info->reg_type = PTR_TO_PACKET_END;
-		return true;
-	}
-	return false;
-}
-
 void bpf_warn_invalid_xdp_action(u32 act)
 {
 	const u32 act_max = XDP_REDIRECT;
@@ -4516,10 +4482,6 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.gen_prologue		= tc_cls_act_prologue,
 };
 
-const struct bpf_verifier_ops tc_cls_act_analyzer_ops = {
-	.is_valid_access	= tc_cls_act_is_valid_access_analyzer,
-};
-
 const struct bpf_prog_ops tc_cls_act_prog_ops = {
 	.test_run		= bpf_prog_test_run_skb,
 };
@@ -4530,10 +4492,6 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
 	.convert_ctx_access	= xdp_convert_ctx_access,
 };
 
-const struct bpf_verifier_ops xdp_analyzer_ops = {
-	.is_valid_access	= xdp_is_valid_access_analyzer,
-};
-
 const struct bpf_prog_ops xdp_prog_ops = {
 	.test_run		= bpf_prog_test_run_xdp,
 };
-- 
cgit v1.3-14-g43fede


From ebc614f687369f9df99828572b1d85a7c2de3d92 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Sun, 5 Nov 2017 08:15:32 -0500
Subject: bpf, cgroup: implement eBPF-based device controller for cgroup v2

Cgroup v2 lacks the device controller, provided by cgroup v1.
This patch adds a new eBPF program type, which in combination
of previously added ability to attach multiple eBPF programs
to a cgroup, will provide a similar functionality, but with some
additional flexibility.

This patch introduces a BPF_PROG_TYPE_CGROUP_DEVICE program type.
A program takes major and minor device numbers, device type
(block/character) and access type (mknod/read/write) as parameters
and returns an integer which defines if the operation should be
allowed or terminated with -EPERM.

Signed-off-by: Roman Gushchin <guro@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf-cgroup.h     | 15 ++++++++++
 include/linux/bpf_types.h      |  3 ++
 include/linux/device_cgroup.h  |  8 ++++-
 include/uapi/linux/bpf.h       | 15 ++++++++++
 kernel/bpf/cgroup.c            | 67 ++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c           |  7 +++++
 kernel/bpf/verifier.c          |  1 +
 tools/include/uapi/linux/bpf.h | 15 ++++++++++
 8 files changed, 130 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 87a7db9feb38..a7f16e0f8d68 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -67,6 +67,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     struct bpf_sock_ops_kern *sock_ops,
 				     enum bpf_attach_type type);
 
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+				      short access, enum bpf_attach_type type);
+
 /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
 ({									      \
@@ -112,6 +115,17 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	}								       \
 	__ret;								       \
 })
+
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access)	      \
+({									      \
+	int __ret = 0;							      \
+	if (cgroup_bpf_enabled)						      \
+		__ret = __cgroup_bpf_check_dev_permission(type, major, minor, \
+							  access,	      \
+							  BPF_CGROUP_DEVICE); \
+									      \
+	__ret;								      \
+})
 #else
 
 struct cgroup_bpf {};
@@ -122,6 +136,7 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
 
 #endif /* CONFIG_CGROUP_BPF */
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 53c5b9ad7220..978c1d9c9383 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -19,6 +19,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
 #endif
+#ifdef CONFIG_CGROUP_BPF
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/device_cgroup.h b/include/linux/device_cgroup.h
index 2d93d7ecd479..8557efe096dc 100644
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -1,5 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/fs.h>
+#include <linux/bpf-cgroup.h>
 
 #define DEVCG_ACC_MKNOD 1
 #define DEVCG_ACC_READ  2
@@ -19,10 +20,15 @@ static inline int __devcgroup_check_permission(short type, u32 major, u32 minor,
 { return 0; }
 #endif
 
-#ifdef CONFIG_CGROUP_DEVICE
+#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)
 static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
 					     short access)
 {
+	int rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access);
+
+	if (rc)
+		return -EPERM;
+
 	return __devcgroup_check_permission(type, major, minor, access);
 }
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4455dd195201..e880ae6434ee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -132,6 +132,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_XMIT,
 	BPF_PROG_TYPE_SOCK_OPS,
 	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
 };
 
 enum bpf_attach_type {
@@ -141,6 +142,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_SOCK_OPS,
 	BPF_SK_SKB_STREAM_PARSER,
 	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -991,4 +993,17 @@ struct bpf_perf_event_value {
 	__u64 running;
 };
 
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	__u32 access_type; /* (access << 16) | type */
+	__u32 major;
+	__u32 minor;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 3db5a17fcfe8..b789ab78d28f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -522,3 +522,70 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+				      short access, enum bpf_attach_type type)
+{
+	struct cgroup *cgrp;
+	struct bpf_cgroup_dev_ctx ctx = {
+		.access_type = (access << 16) | dev_type,
+		.major = major,
+		.minor = minor,
+	};
+	int allow = 1;
+
+	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+				   BPF_PROG_RUN);
+	rcu_read_unlock();
+
+	return !allow;
+}
+EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
+
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+	default:
+		return NULL;
+	}
+}
+
+static bool cgroup_dev_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+		return false;
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
+}
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+	.get_func_proto		= cgroup_dev_func_proto,
+	.is_valid_access	= cgroup_dev_is_valid_access,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 416d70cdfc76..09badc37e864 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1326,6 +1326,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_CGROUP_DEVICE:
+		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, true);
@@ -1378,6 +1381,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_CGROUP_DEVICE:
+		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, false);
@@ -1420,6 +1426,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
 	case BPF_CGROUP_INET_EGRESS:
 	case BPF_CGROUP_INET_SOCK_CREATE:
 	case BPF_CGROUP_SOCK_OPS:
+	case BPF_CGROUP_DEVICE:
 		break;
 	default:
 		return -EINVAL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index add845fe788a..4a942e2e753d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3124,6 +3124,7 @@ static int check_return_code(struct bpf_verifier_env *env)
 	case BPF_PROG_TYPE_CGROUP_SKB:
 	case BPF_PROG_TYPE_CGROUP_SOCK:
 	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_CGROUP_DEVICE:
 		break;
 	default:
 		return 0;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e92f62cf933a..b280f37cd057 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -131,6 +131,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_XMIT,
 	BPF_PROG_TYPE_SOCK_OPS,
 	BPF_PROG_TYPE_SK_SKB,
+	BPF_PROG_TYPE_CGROUP_DEVICE,
 };
 
 enum bpf_attach_type {
@@ -140,6 +141,7 @@ enum bpf_attach_type {
 	BPF_CGROUP_SOCK_OPS,
 	BPF_SK_SKB_STREAM_PARSER,
 	BPF_SK_SKB_STREAM_VERDICT,
+	BPF_CGROUP_DEVICE,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -990,4 +992,17 @@ struct bpf_perf_event_value {
 	__u64 running;
 };
 
+#define BPF_DEVCG_ACC_MKNOD	(1ULL << 0)
+#define BPF_DEVCG_ACC_READ	(1ULL << 1)
+#define BPF_DEVCG_ACC_WRITE	(1ULL << 2)
+
+#define BPF_DEVCG_DEV_BLOCK	(1ULL << 0)
+#define BPF_DEVCG_DEV_CHAR	(1ULL << 1)
+
+struct bpf_cgroup_dev_ctx {
+	__u32 access_type; /* (access << 16) | type */
+	__u32 major;
+	__u32 minor;
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
cgit v1.3-14-g43fede


From 9a19b463863e757e649c37af245b6af101410c1e Mon Sep 17 00:00:00 2001
From: Wang Long <wanglong19@meituan.com>
Date: Thu, 2 Nov 2017 23:05:12 -0400
Subject: workqueue: Fix comment for unbound workqueue's attrbutes

Signed-off-by: Wang Long <wanglong19@meituan.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 64d0edf428f8..5f99851bff09 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5013,9 +5013,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
  *
  * Unbound workqueues have the following extra attributes.
  *
- *  id		RO int	: the associated pool ID
+ *  pool_ids	RO int	: the associated pool IDs for each node
  *  nice	RW int	: nice value of the workers
  *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
+ *  numa	RW bool	: whether enable NUMA affinity
  */
 struct wq_device {
 	struct workqueue_struct		*wq;
-- 
cgit v1.3-14-g43fede


From 01ee6cfb1483fe57c9cbd8e73817dfbf9bacffd3 Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 6 Nov 2017 13:30:28 -0500
Subject: cgroup: export list of delegatable control files using sysfs

Delegatable cgroup v2 control files may require special handling
(e.g. chowning), and the exact list of such files varies between
kernel versions (and likely to be extended in the future).

To guarantee correctness of this list and simplify the life
of userspace (systemd, first of all), let's export the list
via /sys/kernel/cgroup/delegate pseudo-file.

Format is siple: each control file name is printed on a new line.
Example:
  $ cat /sys/kernel/cgroup/delegate
  cgroup.procs
  cgroup.subtree_control

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: kernel-team@fb.com
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6ed725f36d9..eed92ed624e5 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5832,3 +5832,64 @@ int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
 	return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+				      ssize_t size, const char *prefix)
+{
+	struct cftype *cft;
+	ssize_t ret = 0;
+
+	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+			continue;
+
+		if (prefix)
+			ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+		if (unlikely(ret >= size)) {
+			WARN_ON(1);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+			      char *buf)
+{
+	struct cgroup_subsys *ss;
+	int ssid;
+	ssize_t ret = 0;
+
+	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+				     NULL);
+
+	for_each_subsys(ss, ssid)
+		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+					      PAGE_SIZE - ret,
+					      cgroup_subsys_name[ssid]);
+
+	return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+	&cgroup_delegate_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+	.attrs = cgroup_sysfs_attrs,
+	.name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
-- 
cgit v1.3-14-g43fede


From 5f2e673405b742be64e7c3604ed4ed3ac14f35ce Mon Sep 17 00:00:00 2001
From: Roman Gushchin <guro@fb.com>
Date: Mon, 6 Nov 2017 13:30:29 -0500
Subject: cgroup: export list of cgroups v2 features using sysfs

The active development of cgroups v2 sometimes leads to a creation
of interfaces, which are not turned on by default (to provide
backward compatibility). It's handy to know from userspace, which
cgroup v2 features are supported without calculating it based
on the kernel version. So, let's export the list of such features
using /sys/kernel/cgroup/features pseudo-file.

The list is hardcoded and has to be extended when new functionality
is added. Each feature is printed on a new line.

Example:
  $ cat /sys/kernel/cgroup/features
  nsdelegate

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: kernel-team@fb.com
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup/cgroup.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index eed92ed624e5..69e65d28fe98 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5877,8 +5877,16 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
 }
 static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
 
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
 static struct attribute *cgroup_sysfs_attrs[] = {
 	&cgroup_delegate_attr.attr,
+	&cgroup_features_attr.attr,
 	NULL,
 };
 
-- 
cgit v1.3-14-g43fede


From e846d13958066828a9483d862cc8370a72fadbb6 Mon Sep 17 00:00:00 2001
From: Zhou Chengming <zhouchengming1@huawei.com>
Date: Thu, 2 Nov 2017 09:18:21 +0800
Subject: kprobes, x86/alternatives: Use text_mutex to protect smp_alt_modules

We use alternatives_text_reserved() to check if the address is in
the fixed pieces of alternative reserved, but the problem is that
we don't hold the smp_alt mutex when call this function. So the list
traversal may encounter a deleted list_head if another path is doing
alternatives_smp_module_del().

One solution is that we can hold smp_alt mutex before call this
function, but the difficult point is that the callers of this
functions, arch_prepare_kprobe() and arch_prepare_optimized_kprobe(),
are called inside the text_mutex. So we must hold smp_alt mutex
before we go into these arch dependent code. But we can't now,
the smp_alt mutex is the arch dependent part, only x86 has it.
Maybe we can export another arch dependent callback to solve this.

But there is a simpler way to handle this problem. We can reuse the
text_mutex to protect smp_alt_modules instead of using another mutex.
And all the arch dependent checks of kprobes are inside the text_mutex,
so it's safe now.

Signed-off-by: Zhou Chengming <zhouchengming1@huawei.com>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bp@suse.de
Fixes: 2cfa197 "ftrace/alternatives: Introducing *_text_reserved functions"
Link: http://lkml.kernel.org/r/1509585501-79466-1-git-send-email-zhouchengming1@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/alternative.c | 26 +++++++++++++-------------
 kernel/extable.c              |  2 ++
 2 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3344d3382e91..dbaf14d69ebd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -442,7 +442,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
 {
 	const s32 *poff;
 
-	mutex_lock(&text_mutex);
 	for (poff = start; poff < end; poff++) {
 		u8 *ptr = (u8 *)poff + *poff;
 
@@ -452,7 +451,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
 		if (*ptr == 0x3e)
 			text_poke(ptr, ((unsigned char []){0xf0}), 1);
 	}
-	mutex_unlock(&text_mutex);
 }
 
 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
@@ -460,7 +458,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
 {
 	const s32 *poff;
 
-	mutex_lock(&text_mutex);
 	for (poff = start; poff < end; poff++) {
 		u8 *ptr = (u8 *)poff + *poff;
 
@@ -470,7 +467,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
 		if (*ptr == 0xf0)
 			text_poke(ptr, ((unsigned char []){0x3E}), 1);
 	}
-	mutex_unlock(&text_mutex);
 }
 
 struct smp_alt_module {
@@ -489,8 +485,7 @@ struct smp_alt_module {
 	struct list_head next;
 };
 static LIST_HEAD(smp_alt_modules);
-static DEFINE_MUTEX(smp_alt);
-static bool uniproc_patched = false;	/* protected by smp_alt */
+static bool uniproc_patched = false;	/* protected by text_mutex */
 
 void __init_or_module alternatives_smp_module_add(struct module *mod,
 						  char *name,
@@ -499,7 +494,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
 {
 	struct smp_alt_module *smp;
 
-	mutex_lock(&smp_alt);
+	mutex_lock(&text_mutex);
 	if (!uniproc_patched)
 		goto unlock;
 
@@ -526,14 +521,14 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
 smp_unlock:
 	alternatives_smp_unlock(locks, locks_end, text, text_end);
 unlock:
-	mutex_unlock(&smp_alt);
+	mutex_unlock(&text_mutex);
 }
 
 void __init_or_module alternatives_smp_module_del(struct module *mod)
 {
 	struct smp_alt_module *item;
 
-	mutex_lock(&smp_alt);
+	mutex_lock(&text_mutex);
 	list_for_each_entry(item, &smp_alt_modules, next) {
 		if (mod != item->mod)
 			continue;
@@ -541,7 +536,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
 		kfree(item);
 		break;
 	}
-	mutex_unlock(&smp_alt);
+	mutex_unlock(&text_mutex);
 }
 
 void alternatives_enable_smp(void)
@@ -551,7 +546,7 @@ void alternatives_enable_smp(void)
 	/* Why bother if there are no other CPUs? */
 	BUG_ON(num_possible_cpus() == 1);
 
-	mutex_lock(&smp_alt);
+	mutex_lock(&text_mutex);
 
 	if (uniproc_patched) {
 		pr_info("switching to SMP code\n");
@@ -563,10 +558,13 @@ void alternatives_enable_smp(void)
 					      mod->text, mod->text_end);
 		uniproc_patched = false;
 	}
-	mutex_unlock(&smp_alt);
+	mutex_unlock(&text_mutex);
 }
 
-/* Return 1 if the address range is reserved for smp-alternatives */
+/*
+ * Return 1 if the address range is reserved for SMP-alternatives.
+ * Must hold text_mutex.
+ */
 int alternatives_text_reserved(void *start, void *end)
 {
 	struct smp_alt_module *mod;
@@ -574,6 +572,8 @@ int alternatives_text_reserved(void *start, void *end)
 	u8 *text_start = start;
 	u8 *text_end = end;
 
+	lockdep_assert_held(&text_mutex);
+
 	list_for_each_entry(mod, &smp_alt_modules, next) {
 		if (mod->text > text_end || mod->text_end < text_start)
 			continue;
diff --git a/kernel/extable.c b/kernel/extable.c
index 9aa1cc41ecf7..a17fdb63dc3e 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -31,6 +31,8 @@
  * mutex protecting text section modification (dynamic code patching).
  * some users need to sleep (allocating memory...) while they hold this lock.
  *
+ * Note: Also protects SMP-alternatives modification on x86.
+ *
  * NOT exported to modules - patching kernel text is a really delicate matter.
  */
 DEFINE_MUTEX(text_mutex);
-- 
cgit v1.3-14-g43fede


From f791dd2589d7e217625d7e411621a5bac71cbf69 Mon Sep 17 00:00:00 2001
From: Cheng Jian <cj.chengjian@huawei.com>
Date: Fri, 3 Nov 2017 18:59:48 +0800
Subject: locking/rwlocks: Fix comments

- fix the list of locking API headers in kernel/locking/spinlock.c
- fix an #endif comment

Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: huawei.libin@huawei.com
Cc: xiexiuqi@huawei.com
Link: http://lkml.kernel.org/r/1509706788-152547-1-git-send-email-cj.chengjian@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwlock_api_smp.h | 2 +-
 kernel/locking/spinlock.c      | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rwlock_api_smp.h b/include/linux/rwlock_api_smp.h
index 5b9b84b20407..86ebb4bf9c6e 100644
--- a/include/linux/rwlock_api_smp.h
+++ b/include/linux/rwlock_api_smp.h
@@ -211,7 +211,7 @@ static inline void __raw_write_lock(rwlock_t *lock)
 	LOCK_CONTENDED(lock, do_raw_write_trylock, do_raw_write_lock);
 }
 
-#endif /* CONFIG_PREEMPT */
+#endif /* !CONFIG_GENERIC_LOCKBREAK || CONFIG_DEBUG_LOCK_ALLOC */
 
 static inline void __raw_write_unlock(rwlock_t *lock)
 {
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index b96343374a87..1fd1a7543cdd 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -30,7 +30,8 @@
 #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
 /*
  * The __lock_function inlines are taken from
- * include/linux/spinlock_api_smp.h
+ * spinlock : include/linux/spinlock_api_smp.h
+ * rwlock   : include/linux/rwlock_api_smp.h
  */
 #else
 
-- 
cgit v1.3-14-g43fede


From 4ac2aed837cbdbb21c12a28c04718e34c1dc225f Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 20 Oct 2017 09:30:50 -0500
Subject: resource: Consolidate resource walking code

The walk_iomem_res_desc(), walk_system_ram_res() and walk_system_ram_range()
functions each have much of the same code.

Create a new function that consolidates the common code from these
functions in one place to reduce the amount of duplicated code.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Tested-by: Borislav Petkov <bp@suse.de>
Cc: kvm@vger.kernel.org
Cc: Borislav Petkov <bp@alien8.de>
Link: https://lkml.kernel.org/r/20171020143059.3291-9-brijesh.singh@amd.com
---
 kernel/resource.c | 52 +++++++++++++++++++++++++---------------------------
 1 file changed, 25 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 9b5f04404152..7323c1b636cd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -400,6 +400,26 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
 	return 0;
 }
 
+static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
+				 bool first_level_children_only,
+				 void *arg, int (*func)(u64, u64, void *))
+{
+	u64 orig_end = res->end;
+	int ret = -1;
+
+	while ((res->start < res->end) &&
+	       !find_next_iomem_res(res, desc, first_level_children_only)) {
+		ret = (*func)(res->start, res->end, arg);
+		if (ret)
+			break;
+
+		res->start = res->end + 1;
+		res->end = orig_end;
+	}
+
+	return ret;
+}
+
 /*
  * Walks through iomem resources and calls func() with matching resource
  * ranges. This walks through whole tree and not just first level children.
@@ -418,26 +438,12 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
 		u64 end, void *arg, int (*func)(u64, u64, void *))
 {
 	struct resource res;
-	u64 orig_end;
-	int ret = -1;
 
 	res.start = start;
 	res.end = end;
 	res.flags = flags;
-	orig_end = res.end;
-
-	while ((res.start < res.end) &&
-		(!find_next_iomem_res(&res, desc, false))) {
-
-		ret = (*func)(res.start, res.end, arg);
-		if (ret)
-			break;
-
-		res.start = res.end + 1;
-		res.end = orig_end;
-	}
 
-	return ret;
+	return __walk_iomem_res_desc(&res, desc, false, arg, func);
 }
 
 /*
@@ -451,22 +457,13 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 				int (*func)(u64, u64, void *))
 {
 	struct resource res;
-	u64 orig_end;
-	int ret = -1;
 
 	res.start = start;
 	res.end = end;
 	res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-	orig_end = res.end;
-	while ((res.start < res.end) &&
-		(!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
-		ret = (*func)(res.start, res.end, arg);
-		if (ret)
-			break;
-		res.start = res.end + 1;
-		res.end = orig_end;
-	}
-	return ret;
+
+	return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+				     arg, func);
 }
 
 #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
@@ -508,6 +505,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
 {
 	return 1;
 }
+
 /*
  * This generic page_is_ram() returns true if specified address is
  * registered as System RAM in iomem_resource list.
-- 
cgit v1.3-14-g43fede


From 1d2e733b13b450e5854f4a8f8efcd77fa7362d62 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 20 Oct 2017 09:30:51 -0500
Subject: resource: Provide resource struct in resource walk callback

In preperation for a new function that will need additional resource
information during the resource walk, update the resource walk callback to
pass the resource structure.  Since the current callback start and end
arguments are pulled from the resource structure, the callback functions
can obtain them from the resource structure directly.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Borislav Petkov <bp@suse.de>
Tested-by: Borislav Petkov <bp@suse.de>
Cc: kvm@vger.kernel.org
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: linuxppc-dev@lists.ozlabs.org
Link: https://lkml.kernel.org/r/20171020143059.3291-10-brijesh.singh@amd.com
---
 arch/powerpc/kernel/machine_kexec_file_64.c | 12 +++++++++---
 arch/x86/kernel/crash.c                     | 18 +++++++++---------
 arch/x86/kernel/pmem.c                      |  2 +-
 include/linux/ioport.h                      |  4 ++--
 include/linux/kexec.h                       |  2 +-
 kernel/kexec_file.c                         |  5 +++--
 kernel/resource.c                           |  9 +++++----
 7 files changed, 30 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
index 992c0d258e5d..e4395f937d63 100644
--- a/arch/powerpc/kernel/machine_kexec_file_64.c
+++ b/arch/powerpc/kernel/machine_kexec_file_64.c
@@ -91,11 +91,13 @@ int arch_kimage_file_post_load_cleanup(struct kimage *image)
  * and that value will be returned. If all free regions are visited without
  * func returning non-zero, then zero will be returned.
  */
-int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *))
+int arch_kexec_walk_mem(struct kexec_buf *kbuf,
+			int (*func)(struct resource *, void *))
 {
 	int ret = 0;
 	u64 i;
 	phys_addr_t mstart, mend;
+	struct resource res = { };
 
 	if (kbuf->top_down) {
 		for_each_free_mem_range_reverse(i, NUMA_NO_NODE, 0,
@@ -105,7 +107,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *))
 			 * range while in kexec, end points to the last byte
 			 * in the range.
 			 */
-			ret = func(mstart, mend - 1, kbuf);
+			res.start = mstart;
+			res.end = mend - 1;
+			ret = func(&res, kbuf);
 			if (ret)
 				break;
 		}
@@ -117,7 +121,9 @@ int arch_kexec_walk_mem(struct kexec_buf *kbuf, int (*func)(u64, u64, void *))
 			 * range while in kexec, end points to the last byte
 			 * in the range.
 			 */
-			ret = func(mstart, mend - 1, kbuf);
+			res.start = mstart;
+			res.end = mend - 1;
+			ret = func(&res, kbuf);
 			if (ret)
 				break;
 		}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 44404e2307bb..815008c9ca18 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_KEXEC_FILE
-static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
 {
 	unsigned int *nr_ranges = arg;
 
@@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
 	return ret;
 }
 
-static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
 {
 	struct crash_elf_data *ced = arg;
 	Elf64_Ehdr *ehdr;
@@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
 	ehdr = ced->ehdr;
 
 	/* Exclude unwanted mem ranges */
-	ret = elf_header_exclude_ranges(ced, start, end);
+	ret = elf_header_exclude_ranges(ced, res->start, res->end);
 	if (ret)
 		return ret;
 
@@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
 	return 0;
 }
 
-static int memmap_entry_callback(u64 start, u64 end, void *arg)
+static int memmap_entry_callback(struct resource *res, void *arg)
 {
 	struct crash_memmap_data *cmd = arg;
 	struct boot_params *params = cmd->params;
 	struct e820_entry ei;
 
-	ei.addr = start;
-	ei.size = end - start + 1;
+	ei.addr = res->start;
+	ei.size = res->end - res->start + 1;
 	ei.type = cmd->type;
 	add_e820_entry(params, &ei);
 
@@ -619,12 +619,12 @@ out:
 	return ret;
 }
 
-static int determine_backup_region(u64 start, u64 end, void *arg)
+static int determine_backup_region(struct resource *res, void *arg)
 {
 	struct kimage *image = arg;
 
-	image->arch.backup_src_start = start;
-	image->arch.backup_src_sz = end - start + 1;
+	image->arch.backup_src_start = res->start;
+	image->arch.backup_src_sz = res->end - res->start + 1;
 
 	/* Expecting only one range for backup region */
 	return 1;
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 3fe690067802..6b07faaa1579 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -7,7 +7,7 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 
-static int found(u64 start, u64 end, void *data)
+static int found(struct resource *res, void *data)
 {
 	return 1;
 }
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 83c8d6530f0f..c0070d7c4b99 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -272,10 +272,10 @@ walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 		void *arg, int (*func)(unsigned long, unsigned long, void *));
 extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
-		    int (*func)(u64, u64, void *));
+		    int (*func)(struct resource *, void *));
 extern int
 walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end,
-		    void *arg, int (*func)(u64, u64, void *));
+		    void *arg, int (*func)(struct resource *, void *));
 
 /* True if any part of r1 overlaps r2 */
 static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 1c08c925cefb..f16f6ceb3875 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -160,7 +160,7 @@ struct kexec_buf {
 };
 
 int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
-			       int (*func)(u64, u64, void *));
+			       int (*func)(struct resource *, void *));
 extern int kexec_add_buffer(struct kexec_buf *kbuf);
 int kexec_locate_mem_hole(struct kexec_buf *kbuf);
 #endif /* CONFIG_KEXEC_FILE */
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 9f48f4412297..e5bcd94c1efb 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -406,9 +406,10 @@ static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
 	return 1;
 }
 
-static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+static int locate_mem_hole_callback(struct resource *res, void *arg)
 {
 	struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+	u64 start = res->start, end = res->end;
 	unsigned long sz = end - start + 1;
 
 	/* Returning 0 will take to next memory range */
@@ -437,7 +438,7 @@ static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
  * func returning non-zero, then zero will be returned.
  */
 int __weak arch_kexec_walk_mem(struct kexec_buf *kbuf,
-			       int (*func)(u64, u64, void *))
+			       int (*func)(struct resource *, void *))
 {
 	if (kbuf->image->type == KEXEC_TYPE_CRASH)
 		return walk_iomem_res_desc(crashk_res.desc,
diff --git a/kernel/resource.c b/kernel/resource.c
index 7323c1b636cd..8430042fa77b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -402,14 +402,15 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
 
 static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
 				 bool first_level_children_only,
-				 void *arg, int (*func)(u64, u64, void *))
+				 void *arg,
+				 int (*func)(struct resource *, void *))
 {
 	u64 orig_end = res->end;
 	int ret = -1;
 
 	while ((res->start < res->end) &&
 	       !find_next_iomem_res(res, desc, first_level_children_only)) {
-		ret = (*func)(res->start, res->end, arg);
+		ret = (*func)(res, arg);
 		if (ret)
 			break;
 
@@ -435,7 +436,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
  * <linux/ioport.h> and set it in 'desc' of a target resource entry.
  */
 int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
-		u64 end, void *arg, int (*func)(u64, u64, void *))
+		u64 end, void *arg, int (*func)(struct resource *, void *))
 {
 	struct resource res;
 
@@ -454,7 +455,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
  * ranges.
  */
 int walk_system_ram_res(u64 start, u64 end, void *arg,
-				int (*func)(u64, u64, void *))
+				int (*func)(struct resource *, void *))
 {
 	struct resource res;
 
-- 
cgit v1.3-14-g43fede


From 0e4c12b45aa88e74fdda117896d2b61c4e510cb9 Mon Sep 17 00:00:00 2001
From: Tom Lendacky <thomas.lendacky@amd.com>
Date: Fri, 20 Oct 2017 09:30:52 -0500
Subject: x86/mm, resource: Use PAGE_KERNEL protection for ioremap of memory
 pages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In order for memory pages to be properly mapped when SEV is active, it's
necessary to use the PAGE_KERNEL protection attribute as the base
protection.  This ensures that memory mapping of, e.g. ACPI tables,
receives the proper mapping attributes.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Brijesh Singh <brijesh.singh@amd.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@suse.de>
Tested-by: Borislav Petkov <bp@suse.de>
Cc: Laura Abbott <labbott@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: kvm@vger.kernel.org
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Link: https://lkml.kernel.org/r/20171020143059.3291-11-brijesh.singh@amd.com
---
 arch/x86/mm/ioremap.c  | 79 ++++++++++++++++++++++++++++++++++++++++++--------
 include/linux/ioport.h |  3 ++
 kernel/resource.c      | 19 ++++++++++++
 3 files changed, 89 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 52cc0f4ed494..6e4573b1da34 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -27,6 +27,11 @@
 
 #include "physaddr.h"
 
+struct ioremap_mem_flags {
+	bool system_ram;
+	bool desc_other;
+};
+
 /*
  * Fix up the linear direct mapping of the kernel to avoid cache attribute
  * conflicts.
@@ -56,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 	return err;
 }
 
-static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
-			       void *arg)
+static bool __ioremap_check_ram(struct resource *res)
 {
+	unsigned long start_pfn, stop_pfn;
 	unsigned long i;
 
-	for (i = 0; i < nr_pages; ++i)
-		if (pfn_valid(start_pfn + i) &&
-		    !PageReserved(pfn_to_page(start_pfn + i)))
-			return 1;
+	if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
+		return false;
 
-	return 0;
+	start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	stop_pfn = (res->end + 1) >> PAGE_SHIFT;
+	if (stop_pfn > start_pfn) {
+		for (i = 0; i < (stop_pfn - start_pfn); ++i)
+			if (pfn_valid(start_pfn + i) &&
+			    !PageReserved(pfn_to_page(start_pfn + i)))
+				return true;
+	}
+
+	return false;
+}
+
+static int __ioremap_check_desc_other(struct resource *res)
+{
+	return (res->desc != IORES_DESC_NONE);
+}
+
+static int __ioremap_res_check(struct resource *res, void *arg)
+{
+	struct ioremap_mem_flags *flags = arg;
+
+	if (!flags->system_ram)
+		flags->system_ram = __ioremap_check_ram(res);
+
+	if (!flags->desc_other)
+		flags->desc_other = __ioremap_check_desc_other(res);
+
+	return flags->system_ram && flags->desc_other;
+}
+
+/*
+ * To avoid multiple resource walks, this function walks resources marked as
+ * IORESOURCE_MEM and IORESOURCE_BUSY and looking for system RAM and/or a
+ * resource described not as IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
+ */
+static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
+				struct ioremap_mem_flags *flags)
+{
+	u64 start, end;
+
+	start = (u64)addr;
+	end = start + size - 1;
+	memset(flags, 0, sizeof(*flags));
+
+	walk_mem_res(start, end, flags, __ioremap_res_check);
 }
 
 /*
@@ -87,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		unsigned long size, enum page_cache_mode pcm, void *caller)
 {
 	unsigned long offset, vaddr;
-	resource_size_t pfn, last_pfn, last_addr;
+	resource_size_t last_addr;
 	const resource_size_t unaligned_phys_addr = phys_addr;
 	const unsigned long unaligned_size = size;
+	struct ioremap_mem_flags mem_flags;
 	struct vm_struct *area;
 	enum page_cache_mode new_pcm;
 	pgprot_t prot;
@@ -108,13 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		return NULL;
 	}
 
+	__ioremap_check_mem(phys_addr, size, &mem_flags);
+
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
-	pfn      = phys_addr >> PAGE_SHIFT;
-	last_pfn = last_addr >> PAGE_SHIFT;
-	if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
-					  __ioremap_check_ram) == 1) {
+	if (mem_flags.system_ram) {
 		WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
 			  &phys_addr, &last_addr);
 		return NULL;
@@ -146,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		pcm = new_pcm;
 	}
 
+	/*
+	 * If the page being mapped is in memory and SEV is active then
+	 * make sure the memory encryption attribute is enabled in the
+	 * resulting mapping.
+	 */
 	prot = PAGE_KERNEL_IO;
+	if (sev_active() && mem_flags.desc_other)
+		prot = pgprot_encrypted(prot);
+
 	switch (pcm) {
 	case _PAGE_CACHE_MODE_UC:
 	default:
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index c0070d7c4b99..93b4183cf53d 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -271,6 +271,9 @@ extern int
 walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
 		void *arg, int (*func)(unsigned long, unsigned long, void *));
 extern int
+walk_mem_res(u64 start, u64 end, void *arg,
+	     int (*func)(struct resource *, void *));
+extern int
 walk_system_ram_res(u64 start, u64 end, void *arg,
 		    int (*func)(struct resource *, void *));
 extern int
diff --git a/kernel/resource.c b/kernel/resource.c
index 8430042fa77b..54ba6de3757c 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -397,6 +397,8 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
 		res->start = p->start;
 	if (res->end > p->end)
 		res->end = p->end;
+	res->flags = p->flags;
+	res->desc = p->desc;
 	return 0;
 }
 
@@ -467,6 +469,23 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
 				     arg, func);
 }
 
+/*
+ * This function calls the @func callback against all memory ranges, which
+ * are ranges marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ */
+int walk_mem_res(u64 start, u64 end, void *arg,
+		 int (*func)(struct resource *, void *))
+{
+	struct resource res;
+
+	res.start = start;
+	res.end = end;
+	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+	return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
+				     arg, func);
+}
+
 #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
 
 /*
-- 
cgit v1.3-14-g43fede


From 11752adb68a388724b1935d57bf543897c34d80b Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Tue, 7 Nov 2017 16:18:06 -0500
Subject: locking/pvqspinlock: Implement hybrid PV queued/unfair locks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, all the lock waiters entering the slowpath will do one
lock stealing attempt to acquire the lock. That helps performance,
especially in VMs with over-committed vCPUs. However, the current
pvqspinlocks still don't perform as good as unfair locks in many cases.
On the other hands, unfair locks do have the problem of lock starvation
that pvqspinlocks don't have.

This patch combines the best attributes of an unfair lock and a
pvqspinlock into a hybrid lock with 2 modes - queued mode & unfair
mode. A lock waiter goes into the unfair mode when there are waiters
in the wait queue but the pending bit isn't set. Otherwise, it will
go into the queued mode waiting in the queue for its turn.

On a 2-socket 36-core E5-2699 v3 system (HT off), a kernel build
(make -j<n>) was done in a VM with unpinned vCPUs 3 times with the
best time selected and <n> is the number of vCPUs available. The build
times of the original pvqspinlock, hybrid pvqspinlock and unfair lock
with various number of vCPUs are as follows:

  vCPUs    pvqlock     hybrid pvqlock    unfair lock
  -----    -------     --------------    -----------
    30      342.1s         329.1s          329.1s
    36      314.1s         305.3s          307.3s
    45      345.0s         302.1s          306.6s
    54      365.4s         308.6s          307.8s
    72      358.9s         293.6s          303.9s
   108      343.0s         285.9s          304.2s

The hybrid pvqspinlock performs better or comparable to the unfair
lock.

By turning on QUEUED_LOCK_STAT, the table below showed the number
of lock acquisitions in unfair mode and queue mode after a kernel
build with various number of vCPUs.

  vCPUs    queued mode  unfair mode
  -----    -----------  -----------
    30      9,130,518      294,954
    36     10,856,614      386,809
    45      8,467,264   11,475,373
    54      6,409,987   19,670,855
    72      4,782,063   25,712,180

It can be seen that as the VM became more and more over-committed,
the ratio of locks acquired in unfair mode increases. This is all
done automatically to get the best overall performance as possible.

Using a kernel locking microbenchmark with number of locking
threads equals to the number of vCPUs available on the same machine,
the minimum, average and maximum (min/avg/max) numbers of locking
operations done per thread in a 5-second testing interval are shown
below:

  vCPUs         hybrid pvqlock             unfair lock
  -----         --------------             -----------
    36     822,135/881,063/950,363    75,570/313,496/  690,465
    54     542,435/581,664/625,937    35,460/204,280/  457,172
    72     397,500/428,177/499,299    17,933/150,679/  708,001
   108     257,898/288,150/340,871     3,085/181,176/1,257,109

It can be seen that the hybrid pvqspinlocks are more fair and
performant than the unfair locks in this test.

The table below shows the kernel build times on a smaller 2-socket
16-core 32-thread E5-2620 v4 system.

  vCPUs    pvqlock     hybrid pvqlock    unfair lock
  -----    -------     --------------    -----------
    16      436.8s         433.4s          435.6s
    36      366.2s         364.8s          364.5s
    48      423.6s         376.3s          370.2s
    64      433.1s         376.6s          376.8s

Again, the performance of the hybrid pvqspinlock was comparable to
that of the unfair lock.

Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Eduardo Valentin <eduval@amazon.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Radim Krčmář <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1510089486-3466-1-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock_paravirt.h | 47 ++++++++++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 15b6a39366c6..6ee477765e6c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -61,21 +61,50 @@ struct pv_node {
 #include "qspinlock_stat.h"
 
 /*
+ * Hybrid PV queued/unfair lock
+ *
  * By replacing the regular queued_spin_trylock() with the function below,
  * it will be called once when a lock waiter enter the PV slowpath before
- * being queued. By allowing one lock stealing attempt here when the pending
- * bit is off, it helps to reduce the performance impact of lock waiter
- * preemption without the drawback of lock starvation.
+ * being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
  */
-#define queued_spin_trylock(l)	pv_queued_spin_steal_lock(l)
-static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+#define queued_spin_trylock(l)	pv_hybrid_queued_unfair_trylock(l)
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
 {
 	struct __qspinlock *l = (void *)lock;
 
-	if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
-	    (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
-		qstat_inc(qstat_pv_lock_stealing, true);
-		return true;
+	/*
+	 * Stay in unfair lock mode as long as queued mode waiters are
+	 * present in the MCS wait queue but the pending bit isn't set.
+	 */
+	for (;;) {
+		int val = atomic_read(&lock->val);
+
+		if (!(val & _Q_LOCKED_PENDING_MASK) &&
+		   (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+			qstat_inc(qstat_pv_lock_stealing, true);
+			return true;
+		}
+		if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+			break;
+
+		cpu_relax();
 	}
 
 	return false;
-- 
cgit v1.3-14-g43fede


From f71b74bca637fca7de78f1d44eaefde5bc900f9f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:18 +0100
Subject: irq/softirqs: Use lockdep to assert IRQs are disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-3-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/softirq.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4e09821f9d9e..662f7b1b7a78 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -137,7 +137,7 @@ EXPORT_SYMBOL(__local_bh_disable_ip);
 
 static void __local_bh_enable(unsigned int cnt)
 {
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	if (softirq_count() == (cnt & SOFTIRQ_MASK))
 		trace_softirqs_on(_RET_IP_);
@@ -158,7 +158,8 @@ EXPORT_SYMBOL(_local_bh_enable);
 
 void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
 {
-	WARN_ON_ONCE(in_irq() || irqs_disabled());
+	WARN_ON_ONCE(in_irq());
+	lockdep_assert_irqs_enabled();
 #ifdef CONFIG_TRACE_IRQFLAGS
 	local_irq_disable();
 #endif
@@ -396,9 +397,8 @@ void irq_exit(void)
 #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
 	local_irq_disable();
 #else
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 #endif
-
 	account_irq_exit_time(current);
 	preempt_count_sub(HARDIRQ_OFFSET);
 	if (!in_interrupt() && local_softirq_pending())
@@ -488,7 +488,7 @@ EXPORT_SYMBOL(__tasklet_hi_schedule);
 
 void __tasklet_hi_schedule_first(struct tasklet_struct *t)
 {
-	BUG_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	t->next = __this_cpu_read(tasklet_hi_vec.head);
 	__this_cpu_write(tasklet_hi_vec.head, t);
-- 
cgit v1.3-14-g43fede


From 8e8eb730759f9cfd7a761b0b4ee41d714e720993 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:19 +0100
Subject: workqueue: Use lockdep to assert IRQs are disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Tejun Heo <tj@kernel.org>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1509980490-4285-4-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1070b21ba4aa..13f67b5a0a0c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1376,7 +1376,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 	 * queued or lose PENDING.  Grabbing PENDING and queueing should
 	 * happen with IRQ disabled.
 	 */
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	debug_work_activate(work);
 
-- 
cgit v1.3-14-g43fede


From ebf3adbad012b89c4a51a3beae718a587d988a3a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:20 +0100
Subject: timers/nohz: Use lockdep to assert IRQs are disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-5-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c7a899c5ce64..dd4b7b492c9b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -198,7 +198,7 @@ static bool check_tick_dependency(atomic_t *dep)
 
 static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
 {
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	if (unlikely(!cpu_online(cpu)))
 		return false;
@@ -960,8 +960,7 @@ void tick_nohz_idle_enter(void)
 {
 	struct tick_sched *ts;
 
-	WARN_ON_ONCE(irqs_disabled());
-
+	lockdep_assert_irqs_enabled();
 	/*
 	 * Update the idle state in the scheduler domain hierarchy
 	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
-- 
cgit v1.3-14-g43fede


From 53bef3fd47f69e40b52c9f9acd3551dfff9f8702 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:21 +0100
Subject: timers/hrtimer: Use lockdep to assert IRQs are disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-6-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/hrtimer.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 88f75f92ef36..d32520840fde 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -758,9 +758,7 @@ void clock_was_set(void)
  */
 void hrtimers_resume(void)
 {
-	WARN_ONCE(!irqs_disabled(),
-		  KERN_INFO "hrtimers_resume() called with IRQs enabled!");
-
+	lockdep_assert_irqs_disabled();
 	/* Retrigger on the local CPU */
 	retrigger_next_event(NULL);
 	/* And schedule a retrigger for all others */
-- 
cgit v1.3-14-g43fede


From 83efcbd028ad3aec36b5a3882cfa32490c135df7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:22 +0100
Subject: smp/core: Use lockdep to assert IRQs are disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-7-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index c94dd85c8d41..084c8b3a2681 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -213,7 +213,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
 	call_single_data_t *csd, *csd_next;
 	static bool warned;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	head = this_cpu_ptr(&call_single_queue);
 	entry = llist_del_all(head);
-- 
cgit v1.3-14-g43fede


From 164446455a5d3f1402b5a0ea42acce33fd576ed7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:24 +0100
Subject: perf/core: Use lockdep to assert IRQs are disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-9-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index b315aebbcc3f..c298847d4b85 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -209,7 +209,7 @@ static int event_function(void *info)
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
 	int ret = 0;
 
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	perf_ctx_lock(cpuctx, task_ctx);
 	/*
@@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
 	struct task_struct *task = READ_ONCE(ctx->task);
 	struct perf_event_context *task_ctx = NULL;
 
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	if (task) {
 		if (task == TASK_TOMBSTONE)
@@ -1006,7 +1006,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 	struct perf_cpu_context *cpuctx;
 	int rotations = 0;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
 	rotations = perf_rotate_context(cpuctx);
@@ -1093,7 +1093,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
 {
 	struct list_head *head = this_cpu_ptr(&active_ctx_list);
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	WARN_ON(!list_empty(&ctx->active_ctx_list));
 
@@ -1102,7 +1102,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
 
 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
 {
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	WARN_ON(list_empty(&ctx->active_ctx_list));
 
@@ -3523,7 +3523,7 @@ void perf_event_task_tick(void)
 	struct perf_event_context *ctx, *tmp;
 	int throttled;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	__this_cpu_inc(perf_throttled_seq);
 	throttled = __this_cpu_xchg(perf_throttled_count, 0);
-- 
cgit v1.3-14-g43fede


From a934d4d15f040cdd254350e7270b35cc7521e12e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:25 +0100
Subject: irq/timings: Use lockdep to assert IRQs are disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-10-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/irq/timings.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index c8c1d073fbf1..e0923fa4927a 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -264,7 +264,7 @@ u64 irq_timings_next_event(u64 now)
 	 * order to prevent the timings circular buffer to be updated
 	 * while we are reading it.
 	 */
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	/*
 	 * Number of elements in the circular buffer: If it happens it
-- 
cgit v1.3-14-g43fede


From 3c7169a3bf8216a56761a8edf775072dd36a00a0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:26 +0100
Subject: irq_work: Use lockdep to assert IRQs are disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-11-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/irq_work.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index bcf107ce0854..899579657a0a 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -188,7 +188,7 @@ void irq_work_tick(void)
  */
 void irq_work_sync(struct irq_work *work)
 {
-	WARN_ON_ONCE(irqs_disabled());
+	lockdep_assert_irqs_enabled();
 
 	while (work->flags & IRQ_WORK_BUSY)
 		cpu_relax();
-- 
cgit v1.3-14-g43fede


From 2c11dba00a39007b457c7607c1b1a4db95ca04bc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:27 +0100
Subject: sched/clock, sched/cputime: Use lockdep to assert IRQs are
 disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-12-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/clock.c   | 2 +-
 kernel/sched/cputime.c | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ca0f8fc945c6..e086babe6c61 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -388,7 +388,7 @@ void sched_clock_tick(void)
 	if (unlikely(!sched_clock_running))
 		return;
 
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	scd = this_scd();
 	__scd_stamp(scd);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf97c53..9be8b68a66da 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max)
 {
 	u64 accounted;
 
-	/* Shall be converted to a lockdep-enabled lightweight check */
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	accounted = steal_account_process_time(max);
 
-- 
cgit v1.3-14-g43fede


From a69682200db9c2c26594188f81dd2df560af4683 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:28 +0100
Subject: timers/posix-cpu-timers: Use lockdep to assert IRQs are
 disabled/enabled

Use lockdep to check that IRQs are enabled or disabled as expected. This
way the sanity check only shows overhead when concurrency correctness
debug code is enabled.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-13-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/posix-cpu-timers.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5b117110b55b..1f27887aa194 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -603,7 +603,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
 	/*
 	 * Disarm any old timer after extracting its expiry time.
 	 */
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	ret = 0;
 	old_incr = timer->it.cpu.incr;
@@ -1034,7 +1034,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
 	/*
 	 * Now re-arm for the new expiry time.
 	 */
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 	arm_timer(timer);
 unlock:
 	unlock_task_sighand(p, &flags);
@@ -1125,7 +1125,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	struct k_itimer *timer, *next;
 	unsigned long flags;
 
-	WARN_ON_ONCE(!irqs_disabled());
+	lockdep_assert_irqs_disabled();
 
 	/*
 	 * The fast path checks that there are no expired thread or thread
-- 
cgit v1.3-14-g43fede


From b04db8e19fc2e9131524dec43057c1b96d5ba3ba Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Mon, 6 Nov 2017 16:01:30 +0100
Subject: rcu: Use lockdep to assert IRQs are disabled/enabled

Lockdep now has an integrated IRQs disabled/enabled sanity check. Just
use it instead of the ad-hoc RCU version.

Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David S . Miller <davem@davemloft.net>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1509980490-4285-15-git-send-email-frederic@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/rcu/tree.c        | 16 ++++++++--------
 kernel/rcu/tree_plugin.h | 10 +++++-----
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3e3650e94ae6..08fa586d7f8c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -734,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
 	int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
 	int *fp = &rnp->need_future_gp[idx];
 
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	return READ_ONCE(*fp);
 }
 
@@ -746,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
 static bool
 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	if (rcu_gp_in_progress(rsp))
 		return false;  /* No, a grace period is already in progress. */
 	if (rcu_future_needs_gp(rsp))
@@ -773,7 +773,7 @@ static void rcu_eqs_enter_common(bool user)
 	struct rcu_data *rdp;
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
 	if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
 	    !user && !is_idle_task(current)) {
@@ -840,7 +840,7 @@ static void rcu_eqs_enter(bool user)
  */
 void rcu_idle_enter(void)
 {
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	rcu_eqs_enter(false);
 }
 
@@ -855,7 +855,7 @@ void rcu_idle_enter(void)
  */
 void rcu_user_enter(void)
 {
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	rcu_eqs_enter(true);
 }
 #endif /* CONFIG_NO_HZ_FULL */
@@ -880,7 +880,7 @@ void rcu_irq_exit(void)
 {
 	struct rcu_dynticks *rdtp;
 
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 
 	/* Page faults can happen in NMI handlers, so check... */
@@ -947,7 +947,7 @@ static void rcu_eqs_exit(bool user)
 	struct rcu_dynticks *rdtp;
 	long long oldval;
 
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 	oldval = rdtp->dynticks_nesting;
 	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -1018,7 +1018,7 @@ void rcu_irq_enter(void)
 	struct rcu_dynticks *rdtp;
 	long long oldval;
 
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	rdtp = this_cpu_ptr(&rcu_dynticks);
 
 	/* Page faults can happen in NMI handlers, so check... */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e012b9be777e..df08e5c8126b 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -325,7 +325,7 @@ static void rcu_preempt_note_context_switch(bool preempt)
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
 
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
+	lockdep_assert_irqs_disabled();
 	WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
 	if (t->rcu_read_lock_nesting > 0 &&
 	    !t->rcu_read_unlock_special.b.blocked) {
@@ -1421,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
 	unsigned long dj;
 
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 
 	/* Snapshot to detect later posting of non-lazy callback. */
 	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1470,7 +1470,7 @@ static void rcu_prepare_for_idle(void)
 	struct rcu_state *rsp;
 	int tne;
 
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	if (rcu_is_nocb_cpu(smp_processor_id()))
 		return;
 
@@ -1525,7 +1525,7 @@ static void rcu_prepare_for_idle(void)
  */
 static void rcu_cleanup_after_idle(void)
 {
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	if (rcu_is_nocb_cpu(smp_processor_id()))
 		return;
 	if (rcu_try_advance_all_cbs())
@@ -2012,7 +2012,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
 						     struct rcu_data *rdp,
 						     unsigned long flags)
 {
-	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
+	lockdep_assert_irqs_disabled();
 	if (!rcu_is_nocb_cpu(smp_processor_id()))
 		return false; /* Not NOCBs CPU, caller must migrate CBs. */
 	__call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
-- 
cgit v1.3-14-g43fede


From fda784e50aace694ec2e4e16e2de07b91a938563 Mon Sep 17 00:00:00 2001
From: "Bruno E. O. Meneguele" <brdeoliv@redhat.com>
Date: Tue, 24 Oct 2017 15:37:00 -0200
Subject: module: export module signature enforcement status

A static variable sig_enforce is used as status var to indicate the real
value of CONFIG_MODULE_SIG_FORCE, once this one is set the var will hold
true, but if the CONFIG is not set the status var will hold whatever
value is present in the module.sig_enforce kernel cmdline param: true
when =1 and false when =0 or not present.

Considering this cmdline param take place over the CONFIG value when
it's not set, other places in the kernel could misbehave since they
would have only the CONFIG_MODULE_SIG_FORCE value to rely on. Exporting
this status var allows the kernel to rely in the effective value of
module signature enforcement, being it from CONFIG value or cmdline
param.

Signed-off-by: Bruno E. O. Meneguele <brdeoliv@redhat.com>
Signed-off-by: Mimi Zohar <zohar@linux.vnet.ibm.com>
---
 include/linux/module.h |  7 +++++++
 kernel/module.c        | 10 ++++++++++
 2 files changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index fe5aa3736707..c69b49abe877 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -639,6 +639,8 @@ static inline bool is_livepatch_module(struct module *mod)
 }
 #endif /* CONFIG_LIVEPATCH */
 
+bool is_module_sig_enforced(void);
+
 #else /* !CONFIG_MODULES... */
 
 static inline struct module *__module_address(unsigned long addr)
@@ -753,6 +755,11 @@ static inline bool module_requested_async_probing(struct module *module)
 	return false;
 }
 
+static inline bool is_module_sig_enforced(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_MODULES */
 
 #ifdef CONFIG_SYSFS
diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..d1c194b057a2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
 module_param(sig_enforce, bool_enable_only, 0644);
 #endif /* !CONFIG_MODULE_SIG_FORCE */
 
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+	return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
 /* Block module loading/unloading? */
 int modules_disabled = 0;
 core_param(nomodule, modules_disabled, bint, 0);
-- 
cgit v1.3-14-g43fede


From c0f3ea1589394deac2d840c685f57c69e4ac4243 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 8 Nov 2017 12:51:04 -0800
Subject: stop using '%pK' for /proc/kallsyms pointer values

Not only is it annoying to have one single flag for all pointers, as if
that was a global choice and all kernel pointers are the same, but %pK
can't get the 'access' vs 'open' time check right anyway.

So make the /proc/kallsyms pointer value code use logic specific to that
particular file.  We do continue to honor kptr_restrict, but the default
(which is unrestricted) is changed to instead take expected users into
account, and restrict access by default.

Right now the only actual expected user is kernel profiling, which has a
separate sysctl flag for kernel profile access.  There may be others.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kallsyms.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 127e7cfafa55..51b49ed452e4 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -480,6 +480,7 @@ struct kallsym_iter {
 	char name[KSYM_NAME_LEN];
 	char module_name[MODULE_NAME_LEN];
 	int exported;
+	int show_value;
 };
 
 static int get_ksymbol_mod(struct kallsym_iter *iter)
@@ -580,14 +581,23 @@ static void s_stop(struct seq_file *m, void *p)
 {
 }
 
+#ifndef CONFIG_64BIT
+# define KALLSYM_FMT "%08lx"
+#else
+# define KALLSYM_FMT "%016lx"
+#endif
+
 static int s_show(struct seq_file *m, void *p)
 {
+	unsigned long value;
 	struct kallsym_iter *iter = m->private;
 
 	/* Some debugging symbols have no name.  Ignore them. */
 	if (!iter->name[0])
 		return 0;
 
+	value = iter->show_value ? iter->value : 0;
+
 	if (iter->module_name[0]) {
 		char type;
 
@@ -597,10 +607,10 @@ static int s_show(struct seq_file *m, void *p)
 		 */
 		type = iter->exported ? toupper(iter->type) :
 					tolower(iter->type);
-		seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+		seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
 			   type, iter->name, iter->module_name);
 	} else
-		seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+		seq_printf(m, KALLSYM_FMT " %c %s\n", value,
 			   iter->type, iter->name);
 	return 0;
 }
@@ -612,6 +622,40 @@ static const struct seq_operations kallsyms_op = {
 	.show = s_show
 };
 
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+	extern int sysctl_perf_event_paranoid;
+	if (sysctl_perf_event_paranoid <= 1)
+		return 1;
+#endif
+	return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+static int kallsyms_show_value(void)
+{
+	switch (kptr_restrict) {
+	case 0:
+		if (kallsyms_for_perf())
+			return 1;
+	/* fallthrough */
+	case 1:
+		if (has_capability_noaudit(current, CAP_SYSLOG))
+			return 1;
+	/* fallthrough */
+	default:
+		return 0;
+	}
+}
+
 static int kallsyms_open(struct inode *inode, struct file *file)
 {
 	/*
@@ -625,6 +669,7 @@ static int kallsyms_open(struct inode *inode, struct file *file)
 		return -ENOMEM;
 	reset_iter(iter, 0);
 
+	iter->show_value = kallsyms_show_value();
 	return 0;
 }
 
-- 
cgit v1.3-14-g43fede


From 95b982b45122c57da2ee0b46cce70775e1d987af Mon Sep 17 00:00:00 2001
From: Rajat Jain <rajatja@google.com>
Date: Tue, 31 Oct 2017 14:44:24 -0700
Subject: PM / s2idle: Clear the events_check_enabled flag

Problem: This flag does not get cleared currently in the suspend or
resume path in the following cases:

 * In case some driver's suspend routine returns an error.
 * Successful s2idle case
 * etc?

Why is this a problem: What happens is that the next suspend attempt
could fail even though the user did not enable the flag by writing to
/sys/power/wakeup_count. This is 1 use case how the issue can be seen
(but similar use case with driver suspend failure can be thought of):

 1. Read /sys/power/wakeup_count
 2. echo count > /sys/power/wakeup_count
 3. echo freeze > /sys/power/wakeup_count
 4. Let the system suspend, and wakeup the system using some wake source
    that calls pm_wakeup_event() e.g. power button or something.
 5. Note that the combined wakeup count would be incremented due
    to the pm_wakeup_event() in the resume path.
 6. After resuming the events_check_enabled flag is still set.

At this point if the user attempts to freeze again (without writing to
/sys/power/wakeup_count), the suspend would fail even though there has
been no wake event since the past resume.

Address that by clearing the flag just before a resume is completed,
so that it is always cleared for the corner cases mentioned above.

Signed-off-by: Rajat Jain <rajatja@google.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/power/suspend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ccd2d20e6b06..0685c4499431 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -437,7 +437,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 			error = suspend_ops->enter(state);
 			trace_suspend_resume(TPS("machine_suspend"),
 				state, false);
-			events_check_enabled = false;
 		} else if (*wakeup) {
 			error = -EBUSY;
 		}
@@ -582,6 +581,7 @@ static int enter_state(suspend_state_t state)
 	pm_restore_gfp_mask();
 
  Finish:
+	events_check_enabled = false;
 	pm_pr_dbg("Finishing wakeup.\n");
 	suspend_finish();
  Unlock:
-- 
cgit v1.3-14-g43fede


From 07458f6a5171d97511dfbdf6ce549ed2ca0280c7 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 8 Nov 2017 20:23:55 +0530
Subject: cpufreq: schedutil: Reset cached_raw_freq when not in sync with
 next_freq

'cached_raw_freq' is used to get the next frequency quickly but should
always be in sync with sg_policy->next_freq. There is a case where it is
not and in such cases it should be reset to avoid switching to incorrect
frequencies.

Consider this case for example:

 - policy->cur is 1.2 GHz (Max)
 - New request comes for 780 MHz and we store that in cached_raw_freq.
 - Based on 780 MHz, we calculate the effective frequency as 800 MHz.
 - We then see the CPU wasn't idle recently and choose to keep the next
   freq as 1.2 GHz.
 - Now we have cached_raw_freq is 780 MHz and sg_policy->next_freq is
   1.2 GHz.
 - Now if the utilization doesn't change in then next request, then the
   next target frequency will still be 780 MHz and it will match with
   cached_raw_freq. But we will choose 1.2 GHz instead of 800 MHz here.

Fixes: b7eaf1aab9f8 (cpufreq: schedutil: Avoid reducing frequency of busy CPUs prematurely)
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: 4.12+ <stable@vger.kernel.org> # 4.12+
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index ba0da243fdd8..2f52ec0f1539 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -282,8 +282,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 		 * Do not reduce the frequency if the CPU has not been idle
 		 * recently, as the reduction is likely to be premature then.
 		 */
-		if (busy && next_f < sg_policy->next_freq)
+		if (busy && next_f < sg_policy->next_freq) {
 			next_f = sg_policy->next_freq;
+
+			/* Reset cached freq as next_freq has changed */
+			sg_policy->cached_raw_freq = 0;
+		}
 	}
 	sugov_update_commit(sg_policy, time, next_f);
 }
-- 
cgit v1.3-14-g43fede


From 765cc3a4b224e22bf524fabe40284a524f37cdd0 Mon Sep 17 00:00:00 2001
From: Patrick Bellasi <patrick.bellasi@arm.com>
Date: Wed, 8 Nov 2017 18:41:01 +0000
Subject: sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds

When the kernel is compiled with !CONFIG_SCHED_DEBUG support, we expect that
all SCHED_FEAT are turned into compile time constants being propagated
to support compiler optimizations.

Specifically, we expect that code blocks like this:

   if (sched_feat(FEATURE_NAME) [&& <other_conditions>]) {
	/* FEATURE CODE */
   }

are turned into dead-code in case FEATURE_NAME defaults to FALSE, and thus
being removed by the compiler from the finale image.

For this mechanism to properly work it's required for the compiler to
have full access, from each translation unit, to whatever is the value
defined by the sched_feat macro. This macro is defined as:

   #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

and thus, the compiler can optimize that code only if the value of
sysctl_sched_features is visible within each translation unit.

Since:

   029632fbb ("sched: Make separate sched*.c translation units")

the scheduler code has been split into separate translation units
however the definition of sysctl_sched_features is part of
kernel/sched/core.c while, for all the other scheduler modules, it is
visible only via kernel/sched/sched.h as an:

   extern const_debug unsigned int sysctl_sched_features

Unfortunately, an extern reference does not allow the compiler to apply
constants propagation. Thus, on !CONFIG_SCHED_DEBUG kernel we still end up
with code to load a memory reference and (eventually) doing an unconditional
jump of a chunk of code.

This mechanism is unavoidable when sched_features can be turned on and off at
run-time. However, this is not the case for "production" kernels compiled with
!CONFIG_SCHED_DEBUG. In this case, sysctl_sched_features is just a constant value
which cannot be changed at run-time and thus memory loads and jumps can be
avoided altogether.

This patch fixes the case of !CONFIG_SCHED_DEBUG kernel by declaring a local version
of the sysctl_sched_features constant for each translation unit. This will
ultimately allow the compiler to perform constants propagation and dead-code
pruning.

Tests have been done, with !CONFIG_SCHED_DEBUG on a v4.14-rc8 with and without
the patch, by running 30 iterations of:

   perf bench sched messaging --pipe --thread --group 4 --loop 50000

on a 40 cores Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz using the
powersave governor to rule out variations due to frequency scaling.

Statistics on the reported completion time:

                   count     mean       std     min       99%     max
  v4.14-rc8         30.0  15.7831  0.176032  15.442  16.01226  16.014
  v4.14-rc8+patch   30.0  15.5033  0.189681  15.232  15.93938  15.962

... show a 1.8% speedup on average completion time and 0.5% speedup in the
99 percentile.

Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
Reviewed-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Reviewed-by: Brendan Jackman <brendan.jackman@arm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/20171108184101.16006-1-patrick.bellasi@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c  |  9 ++++++---
 kernel/sched/sched.h | 25 ++++++++++++++++++++++---
 2 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1a55c842bfbc..a39a08113003 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -43,18 +43,21 @@
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
 /*
  * Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ * at compile time and compiler optimization based on features default.
  */
-
 #define SCHED_FEAT(name, enabled)	\
 	(1UL << __SCHED_FEAT_##name) * enabled |
-
 const_debug unsigned int sysctl_sched_features =
 #include "features.h"
 	0;
-
 #undef SCHED_FEAT
+#endif
 
 /*
  * Number of tasks to iterate in a single balance run.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 58787e3631c7..45ab0bf564e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1233,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 # define const_debug const
 #endif
 
-extern const_debug unsigned int sysctl_sched_features;
-
 #define SCHED_FEAT(name, enabled)	\
 	__SCHED_FEAT_##name ,
 
@@ -1246,6 +1244,13 @@ enum {
 #undef SCHED_FEAT
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+
+/*
+ * To support run-time toggling of sched features, all the translation units
+ * (but core.c) reference the sysctl_sched_features defined in core.c.
+ */
+extern const_debug unsigned int sysctl_sched_features;
+
 #define SCHED_FEAT(name, enabled)					\
 static __always_inline bool static_branch_##name(struct static_key *key) \
 {									\
@@ -1253,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
 }
 
 #include "features.h"
-
 #undef SCHED_FEAT
 
 extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+
 #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+
+/*
+ * Each translation unit has its own copy of sysctl_sched_features to allow
+ * constants propagation at compile time and compiler optimization based on
+ * features default.
+ */
+#define SCHED_FEAT(name, enabled)	\
+	(1UL << __SCHED_FEAT_##name) * enabled |
+static const_debug __maybe_unused unsigned int sysctl_sched_features =
+#include "features.h"
+	0;
+#undef SCHED_FEAT
+
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
 extern struct static_key_false sched_numa_balancing;
-- 
cgit v1.3-14-g43fede


From 4f8413a3a799c958f7a10a6310a451e6b8aef5ad Mon Sep 17 00:00:00 2001
From: Marc Zyngier <marc.zyngier@arm.com>
Date: Thu, 9 Nov 2017 14:17:59 +0000
Subject: genirq: Track whether the trigger type has been set

When requesting a shared interrupt, we assume that the firmware
support code (DT or ACPI) has called irqd_set_trigger_type
already, so that we can retrieve it and check that the requester
is being reasonnable.

Unfortunately, we still have non-DT, non-ACPI systems around,
and these guys won't call irqd_set_trigger_type before requesting
the interrupt. The consequence is that we fail the request that
would have worked before.

We can either chase all these use cases (boring), or address it
in core code (easier). Let's have a per-irq_desc flag that
indicates whether irqd_set_trigger_type has been called, and
let's just check it when checking for a shared interrupt.
If it hasn't been set, just take whatever the interrupt
requester asks.

Fixes: 382bd4de6182 ("genirq: Use irqd_get_trigger_type to compare the trigger type for shared IRQs")
Cc: stable@vger.kernel.org
Reported-and-tested-by: Petr Cvek <petrcvekcz@gmail.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 include/linux/irq.h | 11 ++++++++++-
 kernel/irq/manage.c | 13 ++++++++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index fda8da7c45e7..73f61eeb152e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -210,6 +210,7 @@ struct irq_data {
  * IRQD_MANAGED_SHUTDOWN	- Interrupt was shutdown due to empty affinity
  *				  mask. Applies only to affinity managed irqs.
  * IRQD_SINGLE_TARGET		- IRQ allows only a single affinity target
+ * IRQD_DEFAULT_TRIGGER_SET	- Expected trigger already been set
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -230,6 +231,7 @@ enum {
 	IRQD_IRQ_STARTED		= (1 << 22),
 	IRQD_MANAGED_SHUTDOWN		= (1 << 23),
 	IRQD_SINGLE_TARGET		= (1 << 24),
+	IRQD_DEFAULT_TRIGGER_SET	= (1 << 25),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -259,18 +261,25 @@ static inline void irqd_mark_affinity_was_set(struct irq_data *d)
 	__irqd_to_state(d) |= IRQD_AFFINITY_SET;
 }
 
+static inline bool irqd_trigger_type_was_set(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_DEFAULT_TRIGGER_SET;
+}
+
 static inline u32 irqd_get_trigger_type(struct irq_data *d)
 {
 	return __irqd_to_state(d) & IRQD_TRIGGER_MASK;
 }
 
 /*
- * Must only be called inside irq_chip.irq_set_type() functions.
+ * Must only be called inside irq_chip.irq_set_type() functions or
+ * from the DT/ACPI setup code.
  */
 static inline void irqd_set_trigger_type(struct irq_data *d, u32 type)
 {
 	__irqd_to_state(d) &= ~IRQD_TRIGGER_MASK;
 	__irqd_to_state(d) |= type & IRQD_TRIGGER_MASK;
+	__irqd_to_state(d) |= IRQD_DEFAULT_TRIGGER_SET;
 }
 
 static inline bool irqd_is_level_type(struct irq_data *d)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e667912d0e9c..21e04e780be4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1228,7 +1228,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		 * set the trigger type must match. Also all must
 		 * agree on ONESHOT.
 		 */
-		unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
+		unsigned int oldtype;
+
+		/*
+		 * If nobody did set the configuration before, inherit
+		 * the one provided by the requester.
+		 */
+		if (irqd_trigger_type_was_set(&desc->irq_data)) {
+			oldtype = irqd_get_trigger_type(&desc->irq_data);
+		} else {
+			oldtype = new->flags & IRQF_TRIGGER_MASK;
+			irqd_set_trigger_type(&desc->irq_data, oldtype);
+		}
 
 		if (!((old->flags & new->flags) & IRQF_SHARED) ||
 		    (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
-- 
cgit v1.3-14-g43fede


From 173743dd99a49c956b124a74c8aacb0384739a4c Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:44:34 -0400
Subject: audit: ensure that 'audit=1' actually enables audit for PID 1

Prior to this patch we enabled audit in audit_init(), which is too
late for PID 1 as the standard initcalls are run after the PID 1 task
is forked.  This means that we never allocate an audit_context (see
audit_alloc()) for PID 1 and therefore miss a lot of audit events
generated by PID 1.

This patch enables audit as early as possible to help ensure that when
PID 1 is forked it can allocate an audit_context if required.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index be1c28fd4d57..ec3d0802734d 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -85,13 +85,13 @@ static int	audit_initialized;
 #define AUDIT_OFF	0
 #define AUDIT_ON	1
 #define AUDIT_LOCKED	2
-u32		audit_enabled;
-u32		audit_ever_enabled;
+u32		audit_enabled = AUDIT_OFF;
+u32		audit_ever_enabled = !!AUDIT_OFF;
 
 EXPORT_SYMBOL_GPL(audit_enabled);
 
 /* Default state when kernel boots without any parameters. */
-static u32	audit_default;
+static u32	audit_default = AUDIT_OFF;
 
 /* If auditing cannot proceed, audit_failure selects what happens. */
 static u32	audit_failure = AUDIT_FAIL_PRINTK;
@@ -1549,8 +1549,6 @@ static int __init audit_init(void)
 	register_pernet_subsys(&audit_net_ops);
 
 	audit_initialized = AUDIT_INITIALIZED;
-	audit_enabled = audit_default;
-	audit_ever_enabled |= !!audit_default;
 
 	kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 	if (IS_ERR(kauditd_task)) {
@@ -1572,6 +1570,8 @@ static int __init audit_enable(char *str)
 	audit_default = !!simple_strtol(str, NULL, 0);
 	if (!audit_default)
 		audit_initialized = AUDIT_DISABLED;
+	audit_enabled = audit_default;
+	audit_ever_enabled = !!audit_enabled;
 
 	pr_info("%s\n", audit_default ?
 		"enabled (after initialization)" : "disabled (until reboot)");
-- 
cgit v1.3-14-g43fede


From be4104abf25c63ccef36e1e06262c73c0df9fd60 Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:44:44 -0400
Subject: audit: initialize the audit subsystem as early as possible

We can't initialize the audit subsystem until after the network layer
is initialized (core_initcall), but do it soon after.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index ec3d0802734d..db71c45ea6f8 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1562,7 +1562,7 @@ static int __init audit_init(void)
 
 	return 0;
 }
-__initcall(audit_init);
+postcore_initcall(audit_init);
 
 /* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
 static int __init audit_enable(char *str)
-- 
cgit v1.3-14-g43fede


From 80ab4df62706b882922c3bb0b05ce2c8ab10828a Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:44:51 -0400
Subject: audit: don't use simple_strtol() anymore

The simple_strtol() function is deprecated, use kstrtol() instead.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index db71c45ea6f8..e75918f79a83 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1567,8 +1567,13 @@ postcore_initcall(audit_init);
 /* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
 static int __init audit_enable(char *str)
 {
-	audit_default = !!simple_strtol(str, NULL, 0);
-	if (!audit_default)
+	long val;
+
+	if (kstrtol(str, 0, &val))
+		panic("audit: invalid 'audit' parameter value (%s)\n", str);
+	audit_default = (val ? AUDIT_ON : AUDIT_OFF);
+
+	if (audit_default == AUDIT_OFF)
 		audit_initialized = AUDIT_DISABLED;
 	audit_enabled = audit_default;
 	audit_ever_enabled = !!audit_enabled;
-- 
cgit v1.3-14-g43fede


From b3b4fdf6a8ae9ea51ea274e3f2f8a6a58b98cc0b Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:44:57 -0400
Subject: audit: convert audit_ever_enabled to a boolean

We were treating it as a boolean, let's make it a boolean to help
avoid future mistakes.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 2 +-
 kernel/audit.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index e75918f79a83..f0cf9bfc806c 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -86,7 +86,7 @@ static int	audit_initialized;
 #define AUDIT_ON	1
 #define AUDIT_LOCKED	2
 u32		audit_enabled = AUDIT_OFF;
-u32		audit_ever_enabled = !!AUDIT_OFF;
+bool		audit_ever_enabled = !!AUDIT_OFF;
 
 EXPORT_SYMBOL_GPL(audit_enabled);
 
diff --git a/kernel/audit.h b/kernel/audit.h
index b331d9b83f63..6bdaf6bd377e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -208,7 +208,7 @@ struct audit_context {
 	struct audit_proctitle proctitle;
 };
 
-extern u32 audit_ever_enabled;
+extern bool audit_ever_enabled;
 
 extern void audit_copy_inode(struct audit_names *name,
 			     const struct dentry *dentry,
-- 
cgit v1.3-14-g43fede


From 5d842a5b77a58160625548fa6be2dc159179ffdb Mon Sep 17 00:00:00 2001
From: Paul Moore <paul@paul-moore.com>
Date: Fri, 1 Sep 2017 09:45:05 -0400
Subject: audit: use audit_set_enabled() in audit_enable()

Use audit_set_enabled() to enable auditing during early boot.  This
obviously won't emit an audit change record, but it will work anyway
and should help prevent in future problems by consolidating the
enable/disable code in one function.

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index f0cf9bfc806c..67b3863261d4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1575,8 +1575,8 @@ static int __init audit_enable(char *str)
 
 	if (audit_default == AUDIT_OFF)
 		audit_initialized = AUDIT_DISABLED;
-	audit_enabled = audit_default;
-	audit_ever_enabled = !!audit_enabled;
+	if (audit_set_enabled(audit_default))
+		panic("audit: error setting audit state (%d)\n", audit_default);
 
 	pr_info("%s\n", audit_default ?
 		"enabled (after initialization)" : "disabled (until reboot)");
-- 
cgit v1.3-14-g43fede


From 33e8a907804428109ce1d12301c3365d619cc4df Mon Sep 17 00:00:00 2001
From: Steve Grubb <sgrubb@redhat.com>
Date: Tue, 17 Oct 2017 18:29:22 -0400
Subject: audit: Allow auditd to set pid to 0 to end auditing

The API to end auditing has historically been for auditd to set the
pid to 0. This patch restores that functionality.

See: https://github.com/linux-audit/audit-kernel/issues/69

Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 kernel/audit.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 67b3863261d4..64e1d0ec19de 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			pid_t auditd_pid;
 			struct pid *req_pid = task_tgid(current);
 
-			/* sanity check - PID values must match */
-			if (new_pid != pid_vnr(req_pid))
+			/* Sanity check - PID values must match. Setting
+			 * pid to 0 is how auditd ends auditing. */
+			if (new_pid && (new_pid != pid_vnr(req_pid)))
 				return -EINVAL;
 
 			/* test the auditd connection */
 			audit_replace(req_pid);
 
 			auditd_pid = auditd_pid_vnr();
-			/* only the current auditd can unregister itself */
-			if ((!new_pid) && (new_pid != auditd_pid)) {
-				audit_log_config_change("audit_pid", new_pid,
-							auditd_pid, 0);
-				return -EACCES;
-			}
-			/* replacing a healthy auditd is not allowed */
-			if (auditd_pid && new_pid) {
-				audit_log_config_change("audit_pid", new_pid,
-							auditd_pid, 0);
-				return -EEXIST;
+			if (auditd_pid) {
+				/* replacing a healthy auditd is not allowed */
+				if (new_pid) {
+					audit_log_config_change("audit_pid",
+							new_pid, auditd_pid, 0);
+					return -EEXIST;
+				}
+				/* only current auditd can unregister itself */
+				if (pid_vnr(req_pid) != auditd_pid) {
+					audit_log_config_change("audit_pid",
+							new_pid, auditd_pid, 0);
+					return -EACCES;
+				}
 			}
 
 			if (new_pid) {
-- 
cgit v1.3-14-g43fede


From f7b53637c090bd8ce2dc74ad0f3aa1898aff2524 Mon Sep 17 00:00:00 2001
From: Casey Schaufler <casey@schaufler-ca.com>
Date: Tue, 24 Oct 2017 18:52:31 -0700
Subject: Audit: remove unused audit_log_secctx function

The function audit_log_secctx() is unused in the upstream kernel.
All it does is wrap another function that doesn't need wrapping.
It claims to give you the SELinux context, but that is not true if
you are using a different security module.

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
Reviewed-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/linux/audit.h |  8 --------
 kernel/audit.c        | 26 --------------------------
 2 files changed, 34 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index 2150bdccfbab..fa1b068d911d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -149,12 +149,6 @@ extern void		    audit_log_key(struct audit_buffer *ab,
 extern void		    audit_log_link_denied(const char *operation,
 						  const struct path *link);
 extern void		    audit_log_lost(const char *message);
-#ifdef CONFIG_SECURITY
-extern void 		    audit_log_secctx(struct audit_buffer *ab, u32 secid);
-#else
-static inline void	    audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{ }
-#endif
 
 extern int audit_log_task_context(struct audit_buffer *ab);
 extern void audit_log_task_info(struct audit_buffer *ab,
@@ -203,8 +197,6 @@ static inline void audit_log_key(struct audit_buffer *ab, char *key)
 static inline void audit_log_link_denied(const char *string,
 					 const struct path *link)
 { }
-static inline void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{ }
 static inline int audit_log_task_context(struct audit_buffer *ab)
 {
 	return 0;
diff --git a/kernel/audit.c b/kernel/audit.c
index 64e1d0ec19de..227db99b0f19 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2345,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 	}
 }
 
-#ifdef CONFIG_SECURITY
-/**
- * audit_log_secctx - Converts and logs SELinux context
- * @ab: audit_buffer
- * @secid: security number
- *
- * This is a helper function that calls security_secid_to_secctx to convert
- * secid to secctx and then adds the (converted) SELinux context to the audit
- * log by calling audit_log_format, thus also preventing leak of internal secid
- * to userspace. If secid cannot be converted audit_panic is called.
- */
-void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{
-	u32 len;
-	char *secctx;
-
-	if (security_secid_to_secctx(secid, &secctx, &len)) {
-		audit_panic("Cannot convert secid to context");
-	} else {
-		audit_log_format(ab, " obj=%s", secctx);
-		security_release_secctx(secctx, len);
-	}
-}
-EXPORT_SYMBOL(audit_log_secctx);
-#endif
-
 EXPORT_SYMBOL(audit_log_start);
 EXPORT_SYMBOL(audit_log_end);
 EXPORT_SYMBOL(audit_log_format);
-- 
cgit v1.3-14-g43fede


From 42d5e37654e4cdb9fb2e2f3ab30045fee35c42d8 Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Wed, 23 Aug 2017 07:03:39 -0400
Subject: audit: filter PATH records keyed on filesystem magic

Tracefs or debugfs were causing hundreds to thousands of PATH records to
be associated with the init_module and finit_module SYSCALL records on a
few modules when the following rule was in place for startup:
	-a always,exit -F arch=x86_64 -S init_module -F key=mod-load

Provide a method to ignore these large number of PATH records from
overwhelming the logs if they are not of interest.  Introduce a new
filter list "AUDIT_FILTER_FS", with a new field type AUDIT_FSTYPE,
which keys off the filesystem 4-octet hexadecimal magic identifier to
filter specific filesystem PATH records.

An example rule would look like:
	-a never,filesystem -F fstype=0x74726163 -F key=ignore_tracefs
	-a never,filesystem -F fstype=0x64626720 -F key=ignore_debugfs

Arguably the better way to address this issue is to disable tracefs and
debugfs on boot from production systems.

See: https://github.com/linux-audit/audit-kernel/issues/16
See: https://github.com/linux-audit/audit-userspace/issues/8
Test case: https://github.com/linux-audit/audit-testsuite/issues/42

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
[PM: fixed the whitespace damage in kernel/auditsc.c]
Signed-off-by: Paul Moore <paul@paul-moore.com>
---
 include/uapi/linux/audit.h |  8 ++++++--
 kernel/auditfilter.c       | 39 ++++++++++++++++++++++++++++++++-------
 kernel/auditsc.c           | 23 +++++++++++++++++++++++
 3 files changed, 61 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 0714a66f0e0c..be711341938e 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -155,8 +155,9 @@
 #define AUDIT_FILTER_WATCH	0x03	/* Apply rule to file system watches */
 #define AUDIT_FILTER_EXIT	0x04	/* Apply rule at syscall exit */
 #define AUDIT_FILTER_TYPE	0x05	/* Apply rule at audit_log_start */
+#define AUDIT_FILTER_FS		0x06	/* Apply rule at __audit_inode_child */
 
-#define AUDIT_NR_FILTERS	6
+#define AUDIT_NR_FILTERS	7
 
 #define AUDIT_FILTER_PREPEND	0x10	/* Prepend to front of list */
 
@@ -256,6 +257,7 @@
 #define AUDIT_OBJ_LEV_HIGH	23
 #define AUDIT_LOGINUID_SET	24
 #define AUDIT_SESSIONID	25	/* Session ID */
+#define AUDIT_FSTYPE	26	/* FileSystem Type */
 
 				/* These are ONLY useful when checking
 				 * at syscall exit time (AUDIT_AT_EXIT). */
@@ -335,13 +337,15 @@ enum {
 #define AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND	0x00000008
 #define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER	0x00000010
 #define AUDIT_FEATURE_BITMAP_LOST_RESET		0x00000020
+#define AUDIT_FEATURE_BITMAP_FILTER_FS		0x00000040
 
 #define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \
 				  AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \
 				  AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH | \
 				  AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND | \
 				  AUDIT_FEATURE_BITMAP_SESSIONID_FILTER | \
-				  AUDIT_FEATURE_BITMAP_LOST_RESET)
+				  AUDIT_FEATURE_BITMAP_LOST_RESET | \
+				  AUDIT_FEATURE_BITMAP_FILTER_FS)
 
 /* deprecated: AUDIT_VERSION_* */
 #define AUDIT_VERSION_LATEST 		AUDIT_FEATURE_BITMAP_ALL
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0b0aa5854dac..4a1758adb222 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[3]),
 	LIST_HEAD_INIT(audit_filter_list[4]),
 	LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
+	LIST_HEAD_INIT(audit_filter_list[6]),
+#if AUDIT_NR_FILTERS != 7
 #error Fix audit_filter_list initialiser
 #endif
 };
@@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_rules_list[3]),
 	LIST_HEAD_INIT(audit_rules_list[4]),
 	LIST_HEAD_INIT(audit_rules_list[5]),
+	LIST_HEAD_INIT(audit_rules_list[6]),
 };
 
 DEFINE_MUTEX(audit_filter_mutex);
@@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
 #endif
 	case AUDIT_FILTER_USER:
 	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		;
 	}
 	if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 		    entry->rule.listnr != AUDIT_FILTER_USER)
 			return -EINVAL;
 		break;
+	case AUDIT_FSTYPE:
+		if (entry->rule.listnr != AUDIT_FILTER_FS)
+			return -EINVAL;
+		break;
+	}
+
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_FS:
+		switch(f->type) {
+		case AUDIT_FSTYPE:
+		case AUDIT_FILTERKEY:
+			break;
+		default:
+			return -EINVAL;
+		}
 	}
 
 	switch(f->type) {
@@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 			return -EINVAL;
 	/* FALL THROUGH */
 	case AUDIT_ARCH:
+	case AUDIT_FSTYPE:
 		if (f->op != Audit_not_equal && f->op != Audit_equal)
 			return -EINVAL;
 		break;
@@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
-	/* If either of these, don't count towards total */
-	if (entry->rule.listnr == AUDIT_FILTER_USER ||
-		entry->rule.listnr == AUDIT_FILTER_TYPE)
+	/* If any of these, don't count towards total */
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		dont_count = 1;
+	}
 #endif
 
 	mutex_lock(&audit_filter_mutex);
@@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry)
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
-	/* If either of these, don't count towards total */
-	if (entry->rule.listnr == AUDIT_FILTER_USER ||
-		entry->rule.listnr == AUDIT_FILTER_TYPE)
+	/* If any of these, don't count towards total */
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		dont_count = 1;
+	}
 #endif
 
 	mutex_lock(&audit_filter_mutex);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index aac1a41f82bd..c9bb29e17335 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent,
 	struct inode *inode = d_backing_inode(dentry);
 	const char *dname = dentry->d_name.name;
 	struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+	struct audit_entry *e;
+	struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+	int i;
 
 	if (!context->in_syscall)
 		return;
 
+	rcu_read_lock();
+	if (!list_empty(list)) {
+		list_for_each_entry_rcu(e, list, list) {
+			for (i = 0; i < e->rule.field_count; i++) {
+				struct audit_field *f = &e->rule.fields[i];
+
+				if (f->type == AUDIT_FSTYPE) {
+					if (audit_comparator(parent->i_sb->s_magic,
+					    f->op, f->val)) {
+						if (e->rule.action == AUDIT_NEVER) {
+							rcu_read_unlock();
+							return;
+						}
+					}
+				}
+			}
+		}
+	}
+	rcu_read_unlock();
+
 	if (inode)
 		handle_one(inode);
 
-- 
cgit v1.3-14-g43fede


From 1f2cac107c591c24b60b115d6050adc213d10fc0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 5 Nov 2017 09:13:48 -0700
Subject: blktrace: fix unlocked access to init/start-stop/teardown

sg.c calls into the blktrace functions without holding the proper queue
mutex for doing setup, start/stop, or teardown.

Add internal unlocked variants, and export the ones that do the proper
locking.

Fixes: 6da127ad0918 ("blktrace: Add blktrace ioctls to SCSI generic devices")
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 58 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 45a3928544ce..ea57dd94b2b2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -336,7 +336,7 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 		blk_unregister_tracepoints();
 }
 
-int blk_trace_remove(struct request_queue *q)
+static int __blk_trace_remove(struct request_queue *q)
 {
 	struct blk_trace *bt;
 
@@ -349,6 +349,17 @@ int blk_trace_remove(struct request_queue *q)
 
 	return 0;
 }
+
+int blk_trace_remove(struct request_queue *q)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_remove(q);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_remove);
 
 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -550,9 +561,8 @@ err:
 	return ret;
 }
 
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-		    struct block_device *bdev,
-		    char __user *arg)
+static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			     struct block_device *bdev, char __user *arg)
 {
 	struct blk_user_trace_setup buts;
 	int ret;
@@ -571,6 +581,19 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 	return 0;
 }
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    struct block_device *bdev,
+		    char __user *arg)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_setup(q, name, dev, bdev, arg);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_setup);
 
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -607,7 +630,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 }
 #endif
 
-int blk_trace_startstop(struct request_queue *q, int start)
+static int __blk_trace_startstop(struct request_queue *q, int start)
 {
 	int ret;
 	struct blk_trace *bt = q->blk_trace;
@@ -646,6 +669,17 @@ int blk_trace_startstop(struct request_queue *q, int start)
 
 	return ret;
 }
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_startstop(q, start);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_startstop);
 
 /*
@@ -676,7 +710,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	switch (cmd) {
 	case BLKTRACESETUP:
 		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+		ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
 	case BLKTRACESETUP32:
@@ -687,10 +721,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	case BLKTRACESTART:
 		start = 1;
 	case BLKTRACESTOP:
-		ret = blk_trace_startstop(q, start);
+		ret = __blk_trace_startstop(q, start);
 		break;
 	case BLKTRACETEARDOWN:
-		ret = blk_trace_remove(q);
+		ret = __blk_trace_remove(q);
 		break;
 	default:
 		ret = -ENOTTY;
@@ -708,10 +742,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(struct request_queue *q)
 {
+	mutex_lock(&q->blk_trace_mutex);
+
 	if (q->blk_trace) {
-		blk_trace_startstop(q, 0);
-		blk_trace_remove(q);
+		__blk_trace_startstop(q, 0);
+		__blk_trace_remove(q);
 	}
+
+	mutex_unlock(&q->blk_trace_mutex);
 }
 
 #ifdef CONFIG_BLK_CGROUP
-- 
cgit v1.3-14-g43fede


From a6da0024ffc19e0d47712bb5ca4fd083f76b07df Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 5 Nov 2017 09:16:09 -0700
Subject: blktrace: fix unlocked registration of tracepoints

We need to ensure that tracepoints are registered and unregistered
with the users of them. The existing atomic count isn't enough for
that. Add a lock around the tracepoints, so we serialize access
to them.

This fixes cases where we have multiple users setting up and
tearing down tracepoints, like this:

CPU: 0 PID: 2995 Comm: syzkaller857118 Not tainted
4.14.0-rc5-next-20171018+ #36
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:16 [inline]
  dump_stack+0x194/0x257 lib/dump_stack.c:52
  panic+0x1e4/0x41c kernel/panic.c:183
  __warn+0x1c4/0x1e0 kernel/panic.c:546
  report_bug+0x211/0x2d0 lib/bug.c:183
  fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177
  do_trap_no_signal arch/x86/kernel/traps.c:211 [inline]
  do_trap+0x260/0x390 arch/x86/kernel/traps.c:260
  do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297
  do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310
  invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
RIP: 0010:tracepoint_add_func kernel/tracepoint.c:210 [inline]
RIP: 0010:tracepoint_probe_register_prio+0x397/0x9a0 kernel/tracepoint.c:283
RSP: 0018:ffff8801d1d1f6c0 EFLAGS: 00010293
RAX: ffff8801d22e8540 RBX: 00000000ffffffef RCX: ffffffff81710f07
RDX: 0000000000000000 RSI: ffffffff85b679c0 RDI: ffff8801d5f19818
RBP: ffff8801d1d1f7c8 R08: ffffffff81710c10 R09: 0000000000000004
R10: ffff8801d1d1f6b0 R11: 0000000000000003 R12: ffffffff817597f0
R13: 0000000000000000 R14: 00000000ffffffff R15: ffff8801d1d1f7a0
  tracepoint_probe_register+0x2a/0x40 kernel/tracepoint.c:304
  register_trace_block_rq_insert include/trace/events/block.h:191 [inline]
  blk_register_tracepoints+0x1e/0x2f0 kernel/trace/blktrace.c:1043
  do_blk_trace_setup+0xa10/0xcf0 kernel/trace/blktrace.c:542
  blk_trace_setup+0xbd/0x180 kernel/trace/blktrace.c:564
  sg_ioctl+0xc71/0x2d90 drivers/scsi/sg.c:1089
  vfs_ioctl fs/ioctl.c:45 [inline]
  do_vfs_ioctl+0x1b1/0x1520 fs/ioctl.c:685
  SYSC_ioctl fs/ioctl.c:700 [inline]
  SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
  entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x444339
RSP: 002b:00007ffe05bb5b18 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000006d66c0 RCX: 0000000000444339
RDX: 000000002084cf90 RSI: 00000000c0481273 RDI: 0000000000000009
RBP: 0000000000000082 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000206 R12: ffffffffffffffff
R13: 00000000c0481273 R14: 0000000000000000 R15: 0000000000000000

since we can now run these in parallel. Ensure that the exported helpers
for doing this are grabbing the queue trace mutex.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/trace/blktrace.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ea57dd94b2b2..206e0e2ace53 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = {
 };
 
 /* Global reference count of probes */
-static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
 
 static void blk_register_tracepoints(void);
 static void blk_unregister_tracepoints(void);
@@ -329,11 +330,26 @@ static void blk_trace_free(struct blk_trace *bt)
 	kfree(bt);
 }
 
+static void get_probe_ref(void)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (++blk_probes_ref == 1)
+		blk_register_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
+static void put_probe_ref(void)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (!--blk_probes_ref)
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
 static void blk_trace_cleanup(struct blk_trace *bt)
 {
 	blk_trace_free(bt);
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
+	put_probe_ref();
 }
 
 static int __blk_trace_remove(struct request_queue *q)
@@ -549,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto err;
 
-	if (atomic_inc_return(&blk_probes_ref) == 1)
-		blk_register_tracepoints();
+	get_probe_ref();
 
 	ret = 0;
 err:
@@ -1596,9 +1611,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (bt == NULL)
 		return -EINVAL;
 
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
-
+	put_probe_ref();
 	blk_trace_free(bt);
 	return 0;
 }
@@ -1629,8 +1642,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
 	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto free_bt;
 
-	if (atomic_inc_return(&blk_probes_ref) == 1)
-		blk_register_tracepoints();
+	get_probe_ref();
 	return 0;
 
 free_bt:
-- 
cgit v1.3-14-g43fede


From e10237cc76ef9a4066a84aa2cc710bfd708cc341 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Tue, 7 Nov 2017 11:09:50 -0800
Subject: kthread: zero the kthread data structure

kthread() could bail out early before we initialize blkcg_css (if the
kthread is killed very early. Please see xchg() statement in kthread()),
which confuses free_kthread_struct. Instead of moving the blkcg_css
initialization early, we simply zero the whole 'self' data structure,
which doesn't sound much overhead.

Reported-by: syzbot <syzkaller@googlegroups.com>
Fixes: 05e3db95ebfc ("kthread: add a mechanism to store cgroup info")
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 kernel/kthread.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index f87cd8b4eb2a..8dbe2454cb1d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -204,7 +204,7 @@ static int kthread(void *_create)
 	struct kthread *self;
 	int ret;
 
-	self = kmalloc(sizeof(*self), GFP_KERNEL);
+	self = kzalloc(sizeof(*self), GFP_KERNEL);
 	set_kthread_struct(self);
 
 	/* If user was SIGKILLed, I release the structure. */
@@ -220,13 +220,9 @@ static int kthread(void *_create)
 		do_exit(-ENOMEM);
 	}
 
-	self->flags = 0;
 	self->data = data;
 	init_completion(&self->exited);
 	init_completion(&self->parked);
-#ifdef CONFIG_BLK_CGROUP
-	self->blkcg_css = NULL;
-#endif
 	current->vfork_done = &self->exited;
 
 	/* OK, tell user we're spawned, wait for stop or wakeup */
-- 
cgit v1.3-14-g43fede


From dd0bb688eaa241b5655d396d45366cba9225aed9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 7 Nov 2017 15:28:42 -0500
Subject: bpf: add a bpf_override_function helper

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with it's kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_funciton
helper.  This will modify the probe'd callers return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/Kconfig                     |  3 +++
 arch/x86/Kconfig                 |  1 +
 arch/x86/include/asm/kprobes.h   |  4 ++++
 arch/x86/include/asm/ptrace.h    |  5 +++++
 arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++++++++
 include/linux/filter.h           |  3 ++-
 include/linux/trace_events.h     |  1 +
 include/uapi/linux/bpf.h         |  7 ++++++-
 kernel/bpf/core.c                |  3 +++
 kernel/bpf/verifier.c            |  2 ++
 kernel/events/core.c             |  7 +++++++
 kernel/trace/Kconfig             | 11 +++++++++++
 kernel/trace/bpf_trace.c         | 35 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c      | 40 +++++++++++++++++++++++++++++++++-------
 kernel/trace/trace_probe.h       |  6 ++++++
 15 files changed, 133 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 057370a0ac4e..6e8520f09bc1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,6 +196,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_KPROBE_OVERRIDE
+	bool
+
 config HAVE_NMI
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2fdb23313dd5..51458c1a0b4a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,6 +153,7 @@ config X86
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
+	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 6cf65437b5e5..c6c3b1f4306a 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index c0e3c45cf6ab..2370bb0149cc 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+	regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 041f7b6dfa0f..3c455bf490cb 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
 	p->ainsn.boostable = false;
 	return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+	".type override_func, @function\n"
+	"override_func:\n"
+	"	ret\n"
+	".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+	regs->ip = (unsigned long)&override_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0cd02ff4ae30..eaec066f99e8 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -459,7 +459,8 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				kprobe_override:1; /* Do we override a kprobe? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 84014ecfa67f..17e5e820a84c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -523,6 +523,7 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
+DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e880ae6434ee..adb66f78b674 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ *	@pt_regs: pointer to struct pt_regs
+ *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -736,7 +740,8 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),
+	FN(getsockopt),			\
+	FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8a6c37762330..271daad31f37 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1326,6 +1326,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
+	if (fp->kprobe_override)
+		return false;
+
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4a942e2e753d..bc464b8ec91e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4357,6 +4357,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
+		if (insn->imm == BPF_FUNC_override_return)
+			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
 			/* If we tail call into other programs, we
 			 * cannot make any assumptions since they can
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 42d24bd64ea4..ac240d31b5bf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,6 +8171,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	/* Kprobe override only works for kprobes, not uprobes. */
+	if (prog->kprobe_override &&
+	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
 	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..9dc0deeaad2b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -518,6 +518,17 @@ config FUNCTION_PROFILER
 
 	  If in doubt, say N.
 
+config BPF_KPROBE_OVERRIDE
+	bool "Enable BPF programs to override a kprobed function"
+	depends on BPF_EVENTS
+	depends on KPROBES_ON_FTRACE
+	depends on HAVE_KPROBE_OVERRIDE
+	depends on DYNAMIC_FTRACE_WITH_REGS
+	default n
+	help
+	 Allows BPF to override the execution of a probed function and
+	 set a different return value.  This is used for error injection.
+
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 506efe6e8ed9..1865b0d4cdeb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,10 @@
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
+#include <linux/kprobes.h>
+#include <asm/kprobes.h>
+
+#include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	__this_cpu_write(bpf_kprobe_override, 1);
+	regs_set_return_value(regs, rc);
+	arch_ftrace_kprobe_override_function(regs);
+	return 0;
+}
+#else
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	return -EINVAL;
+}
+#endif
+
+static const struct bpf_func_proto bpf_override_return_proto = {
+	.func		= bpf_override_return,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
 	int ret;
@@ -551,6 +578,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
+	case BPF_FUNC_override_return:
+		pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!",
+				    current->comm, task_pid_nr(current));
+		return &bpf_override_return_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -766,6 +797,10 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	struct bpf_prog_array *new_array;
 	int ret = -EEXIST;
 
+	/* Kprobe override only works for ftrace based kprobes. */
+	if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event))
+		return -EINVAL;
+
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index abf92e478cfb..8e3c9ec1faf7 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,6 +42,7 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
+DEFINE_PER_CPU(int, bpf_kprobe_override);
 
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
@@ -87,6 +88,12 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
+int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+	return kprobe_ftrace(&tk->rp.kp);
+}
+
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
@@ -1170,7 +1177,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static void
+static int
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
@@ -1179,12 +1186,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int size, __size, dsize;
 	int rctx;
 
-	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
-		return;
+	if (bpf_prog_array_valid(call)) {
+		int ret;
+
+		ret = trace_call_bpf(call, regs);
+
+		/*
+		 * We need to check and see if we modified the pc of the
+		 * pt_regs, and if so clear the kprobe and return 1 so that we
+		 * don't do the instruction skipping.  Also reset our state so
+		 * we are clean the next pass through.
+		 */
+		if (__this_cpu_read(bpf_kprobe_override)) {
+			__this_cpu_write(bpf_kprobe_override, 0);
+			reset_current_kprobe();
+			return 1;
+		}
+		if (!ret)
+			return 0;
+	}
 
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
-		return;
+		return 0;
 
 	dsize = __get_data_size(&tk->tp, regs);
 	__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1193,13 +1217,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 
 	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
-		return;
+		return 0;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL, NULL);
+	return 0;
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1275,6 +1300,7 @@ static int kprobe_register(struct trace_event_call *event,
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
+	int ret = 0;
 
 	raw_cpu_inc(*tk->nhit);
 
@@ -1282,9 +1308,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tk->tp.flags & TP_FLAG_PROFILE)
-		kprobe_perf_func(tk, regs);
+		ret = kprobe_perf_func(tk, regs);
 #endif
-	return 0;	/* We don't tweek kernel, so just return 0 */
+	return ret;
 }
 NOKPROBE_SYMBOL(kprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..adbb3f7d1fb5 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -253,6 +253,7 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
+int trace_kprobe_ftrace(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -278,6 +279,11 @@ alloc_symbol_cache(const char *sym, long offset)
 {
 	return NULL;
 }
+
+static inline int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	return 0;
+}
 #endif /* CONFIG_KPROBE_EVENTS */
 
 struct probe_arg {
-- 
cgit v1.3-14-g43fede


From f3edacbd697f94a743fff1a3d26910ab99948ba7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sat, 11 Nov 2017 18:24:55 +0900
Subject: bpf: Revert bpf_overrid_function() helper changes.

NACK'd by x86 maintainer.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/Kconfig                              |  3 ---
 arch/x86/Kconfig                          |  1 -
 arch/x86/include/asm/kprobes.h            |  4 ----
 arch/x86/include/asm/ptrace.h             |  5 ----
 arch/x86/kernel/kprobes/ftrace.c          | 14 -----------
 include/linux/filter.h                    |  3 +--
 include/linux/trace_events.h              |  1 -
 include/uapi/linux/bpf.h                  |  7 +-----
 kernel/bpf/core.c                         |  3 ---
 kernel/bpf/verifier.c                     |  2 --
 kernel/events/core.c                      |  7 ------
 kernel/trace/Kconfig                      | 11 ---------
 kernel/trace/bpf_trace.c                  | 35 ---------------------------
 kernel/trace/trace_kprobe.c               | 40 ++++++-------------------------
 kernel/trace/trace_probe.h                |  6 -----
 samples/bpf/Makefile                      |  4 ----
 samples/bpf/test_override_return.sh       | 15 ------------
 samples/bpf/tracex7_kern.c                | 16 -------------
 samples/bpf/tracex7_user.c                | 28 ----------------------
 tools/include/uapi/linux/bpf.h            |  7 +-----
 tools/testing/selftests/bpf/bpf_helpers.h |  3 +--
 21 files changed, 11 insertions(+), 204 deletions(-)
 delete mode 100755 samples/bpf/test_override_return.sh
 delete mode 100644 samples/bpf/tracex7_kern.c
 delete mode 100644 samples/bpf/tracex7_user.c

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 6e8520f09bc1..057370a0ac4e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,9 +196,6 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
-config HAVE_KPROBE_OVERRIDE
-	bool
-
 config HAVE_NMI
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 51458c1a0b4a..2fdb23313dd5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -153,7 +153,6 @@ config X86
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
-	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index c6c3b1f4306a..6cf65437b5e5 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,10 +67,6 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
-#ifdef CONFIG_KPROBES_ON_FTRACE
-extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
-#endif
-
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 2370bb0149cc..c0e3c45cf6ab 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,11 +109,6 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->ax;
 }
 
-static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
-{
-	regs->ax = rc;
-}
-
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 3c455bf490cb..041f7b6dfa0f 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
 	p->ainsn.boostable = false;
 	return 0;
 }
-
-asmlinkage void override_func(void);
-asm(
-	".type override_func, @function\n"
-	"override_func:\n"
-	"	ret\n"
-	".size override_func, .-override_func\n"
-);
-
-void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
-{
-	regs->ip = (unsigned long)&override_func;
-}
-NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index eaec066f99e8..0cd02ff4ae30 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -459,8 +459,7 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1,	/* Do we need dst entry? */
-				kprobe_override:1; /* Do we override a kprobe? */
+				dst_needed:1;	/* Do we need dst entry? */
 	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 17e5e820a84c..84014ecfa67f 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -523,7 +523,6 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
-DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index adb66f78b674..e880ae6434ee 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,10 +677,6 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -740,8 +736,7 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),			\
-	FN(override_return),
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 271daad31f37..8a6c37762330 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1326,9 +1326,6 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
-	if (fp->kprobe_override)
-		return false;
-
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index bc464b8ec91e..4a942e2e753d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4357,8 +4357,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
-		if (insn->imm == BPF_FUNC_override_return)
-			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
 			/* If we tail call into other programs, we
 			 * cannot make any assumptions since they can
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ac240d31b5bf..42d24bd64ea4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,13 +8171,6 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
-	/* Kprobe override only works for kprobes, not uprobes. */
-	if (prog->kprobe_override &&
-	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
-		bpf_prog_put(prog);
-		return -EINVAL;
-	}
-
 	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 9dc0deeaad2b..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -518,17 +518,6 @@ config FUNCTION_PROFILER
 
 	  If in doubt, say N.
 
-config BPF_KPROBE_OVERRIDE
-	bool "Enable BPF programs to override a kprobed function"
-	depends on BPF_EVENTS
-	depends on KPROBES_ON_FTRACE
-	depends on HAVE_KPROBE_OVERRIDE
-	depends on DYNAMIC_FTRACE_WITH_REGS
-	default n
-	help
-	 Allows BPF to override the execution of a probed function and
-	 set a different return value.  This is used for error injection.
-
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1865b0d4cdeb..506efe6e8ed9 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,10 +13,6 @@
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
-#include <linux/kprobes.h>
-#include <asm/kprobes.h>
-
-#include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -80,29 +76,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
-#ifdef CONFIG_BPF_KPROBE_OVERRIDE
-BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
-{
-	__this_cpu_write(bpf_kprobe_override, 1);
-	regs_set_return_value(regs, rc);
-	arch_ftrace_kprobe_override_function(regs);
-	return 0;
-}
-#else
-BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
-{
-	return -EINVAL;
-}
-#endif
-
-static const struct bpf_func_proto bpf_override_return_proto = {
-	.func		= bpf_override_return,
-	.gpl_only	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_ANYTHING,
-};
-
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
 	int ret;
@@ -578,10 +551,6 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
-	case BPF_FUNC_override_return:
-		pr_warn_ratelimited("%s[%d] is installing a program with bpf_override_return helper that may cause unexpected behavior!",
-				    current->comm, task_pid_nr(current));
-		return &bpf_override_return_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -797,10 +766,6 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	struct bpf_prog_array *new_array;
 	int ret = -EEXIST;
 
-	/* Kprobe override only works for ftrace based kprobes. */
-	if (prog->kprobe_override && !trace_kprobe_ftrace(event->tp_event))
-		return -EINVAL;
-
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8e3c9ec1faf7..abf92e478cfb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,7 +42,6 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
-DEFINE_PER_CPU(int, bpf_kprobe_override);
 
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
@@ -88,12 +87,6 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
-int trace_kprobe_ftrace(struct trace_event_call *call)
-{
-	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
-	return kprobe_ftrace(&tk->rp.kp);
-}
-
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
@@ -1177,7 +1170,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static int
+static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
@@ -1186,29 +1179,12 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int size, __size, dsize;
 	int rctx;
 
-	if (bpf_prog_array_valid(call)) {
-		int ret;
-
-		ret = trace_call_bpf(call, regs);
-
-		/*
-		 * We need to check and see if we modified the pc of the
-		 * pt_regs, and if so clear the kprobe and return 1 so that we
-		 * don't do the instruction skipping.  Also reset our state so
-		 * we are clean the next pass through.
-		 */
-		if (__this_cpu_read(bpf_kprobe_override)) {
-			__this_cpu_write(bpf_kprobe_override, 0);
-			reset_current_kprobe();
-			return 1;
-		}
-		if (!ret)
-			return 0;
-	}
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
+		return;
 
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
-		return 0;
+		return;
 
 	dsize = __get_data_size(&tk->tp, regs);
 	__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1217,14 +1193,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 
 	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
-		return 0;
+		return;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL, NULL);
-	return 0;
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1300,7 +1275,6 @@ static int kprobe_register(struct trace_event_call *event,
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
-	int ret = 0;
 
 	raw_cpu_inc(*tk->nhit);
 
@@ -1308,9 +1282,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
 	if (tk->tp.flags & TP_FLAG_PROFILE)
-		ret = kprobe_perf_func(tk, regs);
+		kprobe_perf_func(tk, regs);
 #endif
-	return ret;
+	return 0;	/* We don't tweek kernel, so just return 0 */
 }
 NOKPROBE_SYMBOL(kprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index adbb3f7d1fb5..903273c93e61 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -253,7 +253,6 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
-int trace_kprobe_ftrace(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -279,11 +278,6 @@ alloc_symbol_cache(const char *sym, long offset)
 {
 	return NULL;
 }
-
-static inline int trace_kprobe_ftrace(struct trace_event_call *call)
-{
-	return 0;
-}
 #endif /* CONFIG_KPROBE_EVENTS */
 
 struct probe_arg {
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 87db0f9a4c15..3b4945c1eab0 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -15,7 +15,6 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
-hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -62,7 +61,6 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
-tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -106,7 +104,6 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
-always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -161,7 +158,6 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
-HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
deleted file mode 100755
index e68b9ee6814b..000000000000
--- a/samples/bpf/test_override_return.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-rm -f testfile.img
-dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
-DEVICE=$(losetup --show -f testfile.img)
-mkfs.btrfs -f $DEVICE
-mkdir tmpmnt
-./tracex7 $DEVICE
-if [ $? -eq 0 ]
-then
-	echo "SUCCESS!"
-else
-	echo "FAILED!"
-fi
-losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
deleted file mode 100644
index 1ab308a43e0f..000000000000
--- a/samples/bpf/tracex7_kern.c
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <uapi/linux/ptrace.h>
-#include <uapi/linux/bpf.h>
-#include <linux/version.h>
-#include "bpf_helpers.h"
-
-SEC("kprobe/open_ctree")
-int bpf_prog1(struct pt_regs *ctx)
-{
-	unsigned long rc = -12;
-
-	bpf_override_return(ctx, rc);
-	return 0;
-}
-
-char _license[] SEC("license") = "GPL";
-u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
deleted file mode 100644
index 8a52ac492e8b..000000000000
--- a/samples/bpf/tracex7_user.c
+++ /dev/null
@@ -1,28 +0,0 @@
-#define _GNU_SOURCE
-
-#include <stdio.h>
-#include <linux/bpf.h>
-#include <unistd.h>
-#include "libbpf.h"
-#include "bpf_load.h"
-
-int main(int argc, char **argv)
-{
-	FILE *f;
-	char filename[256];
-	char command[256];
-	int ret;
-
-	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
-
-	if (load_bpf_file(filename)) {
-		printf("%s", bpf_log_buf);
-		return 1;
-	}
-
-	snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
-	f = popen(command, "r");
-	ret = pclose(f);
-
-	return ret ? 0 : 1;
-}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index adb66f78b674..e880ae6434ee 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -677,10 +677,6 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
- *
- * int bpf_override_return(pt_regs, rc)
- *	@pt_regs: pointer to struct pt_regs
- *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -740,8 +736,7 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),			\
-	FN(override_return),
+	FN(getsockopt),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 33cb00e46c49..fd9a17fa8a8b 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -82,8 +82,7 @@ static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
 static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
 				       unsigned int buf_size) =
 	(void *) BPF_FUNC_perf_prog_read_value;
-static int (*bpf_override_return)(void *ctx, unsigned long rc) =
-	(void *) BPF_FUNC_override_return;
+
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
cgit v1.3-14-g43fede


From d00a08cf9ee986ad6689ce8c6fd176aff679c106 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 12 Nov 2017 13:02:51 +0100
Subject: irq/work: Use llist_for_each_entry_safe

The llist_for_each_entry() loop in irq_work_run_list() is unsafe because
once the works PENDING bit is cleared it can be requeued on another CPU.

Use llist_for_each_entry_safe() instead.

Fixes: 16c0890dc66d ("irq/work: Don't reinvent the wheel but use existing llist API")
Reported-by:Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Petri Latvala <petri.latvala@intel.com>
Link: http://lkml.kernel.org/r/151027307351.14762.4611888896020658384@mail.alporthouse.com
---
 kernel/irq_work.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e2ebe8c71e8f..6647b33f7eb0 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -128,9 +128,9 @@ bool irq_work_needs_cpu(void)
 
 static void irq_work_run_list(struct llist_head *list)
 {
-	unsigned long flags;
-	struct irq_work *work;
+	struct irq_work *work, *tmp;
 	struct llist_node *llnode;
+	unsigned long flags;
 
 	BUG_ON(!irqs_disabled());
 
@@ -138,7 +138,7 @@ static void irq_work_run_list(struct llist_head *list)
 		return;
 
 	llnode = llist_del_all(list);
-	llist_for_each_entry(work, llnode, llnode) {
+	llist_for_each_entry_safe(work, tmp, llnode, llnode) {
 		/*
 		 * Clear the PENDING bit, after this point the @work
 		 * can be re-used.
-- 
cgit v1.3-14-g43fede


From df27067e6040b51188184876253d93da002433aa Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 10 Nov 2017 16:25:04 +0100
Subject: pstore: Use ktime_get_real_fast_ns() instead of __getnstimeofday()

__getnstimeofday() is a rather odd interface, with a number of quirks:

- The caller may come from NMI context, but the implementation is not NMI safe,
  one way to get there from NMI is

      NMI handler:
        something bad
          panic()
            kmsg_dump()
              pstore_dump()
                 pstore_record_init()
                   __getnstimeofday()

- The calling conventions are different from any other timekeeping functions,
  to deal with returning an error code during suspended timekeeping.

Address the above issues by using a completely different method to get the
time: ktime_get_real_fast_ns() is NMI safe and has a reasonable behavior
when timekeeping is suspended: it returns the time at which it got
suspended. As Thomas Gleixner explained, this is safe, as
ktime_get_real_fast_ns() does not call into the clocksource driver that
might be suspended.

The result can easily be transformed into a timespec structure. Since
ktime_get_real_fast_ns() was not exported to modules, add the export.

The pstore behavior for the suspended case changes slightly, as it now
stores the timestamp at which timekeeping was suspended instead of storing
a zero timestamp.

This change is not addressing y2038-safety, that's subject to a more
complex follow up patch.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Anton Vorontsov <anton@enomsg.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Colin Cross <ccross@android.com>
Link: https://lkml.kernel.org/r/20171110152530.1926955-1-arnd@arndb.de
---
 fs/pstore/platform.c      | 5 +----
 kernel/time/timekeeping.c | 1 +
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index ec7199e859d2..086e491faf04 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -482,10 +482,7 @@ void pstore_record_init(struct pstore_record *record,
 	record->psi = psinfo;
 
 	/* Report zeroed timestamp if called before timekeeping has resumed. */
-	if (__getnstimeofday(&record->time)) {
-		record->time.tv_sec = 0;
-		record->time.tv_nsec = 0;
-	}
+	record->time = ns_to_timespec(ktime_get_real_fast_ns());
 }
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 353f7bd1eeb0..198afa78bf69 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -528,6 +528,7 @@ u64 ktime_get_real_fast_ns(void)
 {
 	return __ktime_get_real_fast_ns(&tk_fast_mono);
 }
+EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
 
 /**
  * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
-- 
cgit v1.3-14-g43fede


From b24591e2fcf852ad7ad2ccf745c8220bf378d312 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 9 Nov 2017 12:35:07 +0000
Subject: timers: Add a function to start/reduce a timer

Add a function, similar to mod_timer(), that will start a timer if it isn't
running and will modify it if it is running and has an expiry time longer
than the new time.  If the timer is running with an expiry time that's the
same or sooner, no change is made.

The function looks like:

	int timer_reduce(struct timer_list *timer, unsigned long expires);

This can be used by code such as networking code to make it easier to share
a timer for multiple timeouts.  For instance, in upcoming AF_RXRPC code,
the rxrpc_call struct will maintain a number of timeouts:

	unsigned long	ack_at;
	unsigned long	resend_at;
	unsigned long	ping_at;
	unsigned long	expect_rx_by;
	unsigned long	expect_req_by;
	unsigned long	expect_term_by;

each of which is set independently of the others.  With timer reduction
available, when the code needs to set one of the timeouts, it only needs to
look at that timeout and then call timer_reduce() to modify the timer,
starting it or bringing it forward if necessary.  There is no need to refer
to the other timeouts to see which is earliest and no need to take any lock
other than, potentially, the timer lock inside timer_reduce().

Note, that this does not protect against concurrent invocations of any of
the timer functions.

As an example, the expect_rx_by timeout above, which terminates a call if
we don't get a packet from the server within a certain time window, would
be set something like this:

	unsigned long now = jiffies;
	unsigned long expect_rx_by = now + packet_receive_timeout;
	WRITE_ONCE(call->expect_rx_by, expect_rx_by);
	timer_reduce(&call->timer, expect_rx_by);

The timer service code (which might, say, be in a work function) would then
check all the timeouts to see which, if any, had triggered, deal with
those:

	t = READ_ONCE(call->ack_at);
	if (time_after_eq(now, t)) {
		cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET);
		set_bit(RXRPC_CALL_EV_ACK, &call->events);
	}

and then restart the timer if necessary by finding the soonest timeout that
hasn't yet passed and then calling timer_reduce().

The disadvantage of doing things this way rather than comparing the timers
each time and calling mod_timer() is that you *will* take timer events
unless you can finish what you're doing and delete the timer in time.

The advantage of doing things this way is that you don't need to use a lock
to work out when the next timer should be set, other than the timer's own
lock - which you might not have to take.

[ tglx: Fixed weird formatting and adopted it to pending changes ]

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: keyrings@vger.kernel.org
Cc: linux-afs@lists.infradead.org
Link: https://lkml.kernel.org/r/151023090769.23050.1801643667223880753.stgit@warthog.procyon.org.uk
---
 include/linux/timer.h |  1 +
 kernel/time/timer.c   | 45 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 39 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 9f8895decb82..37b5e2f74d21 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -203,6 +203,7 @@ extern void add_timer_on(struct timer_list *timer, int cpu);
 extern int del_timer(struct timer_list * timer);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
+extern int timer_reduce(struct timer_list *timer, unsigned long expires);
 
 /*
  * The jiffies value which is added to now, when there is no timer
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index fbb1f85327bf..af0b8bae4502 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -929,8 +929,11 @@ static struct timer_base *lock_timer_base(struct timer_list *timer,
 	}
 }
 
+#define MOD_TIMER_PENDING_ONLY		0x01
+#define MOD_TIMER_REDUCE		0x02
+
 static inline int
-__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
+__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
 {
 	struct timer_base *base, *new_base;
 	unsigned int idx = UINT_MAX;
@@ -950,7 +953,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 * larger granularity than you would get from adding a new
 		 * timer with this expiry.
 		 */
-		if (timer->expires == expires)
+		long diff = timer->expires - expires;
+
+		if (!diff)
+			return 1;
+		if (options & MOD_TIMER_REDUCE && diff <= 0)
 			return 1;
 
 		/*
@@ -962,6 +969,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		base = lock_timer_base(timer, &flags);
 		forward_timer_base(base);
 
+		if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
+		    time_before_eq(timer->expires, expires)) {
+			ret = 1;
+			goto out_unlock;
+		}
+
 		clk = base->clk;
 		idx = calc_wheel_index(expires, clk);
 
@@ -971,7 +984,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 		 * subsequent call will exit in the expires check above.
 		 */
 		if (idx == timer_get_idx(timer)) {
-			timer->expires = expires;
+			if (!(options & MOD_TIMER_REDUCE))
+				timer->expires = expires;
+			else if (time_after(timer->expires, expires))
+				timer->expires = expires;
 			ret = 1;
 			goto out_unlock;
 		}
@@ -981,7 +997,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 	}
 
 	ret = detach_if_pending(timer, base, false);
-	if (!ret && pending_only)
+	if (!ret && (options & MOD_TIMER_PENDING_ONLY))
 		goto out_unlock;
 
 	debug_activate(timer, expires);
@@ -1042,7 +1058,7 @@ out_unlock:
  */
 int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, true);
+	return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
 }
 EXPORT_SYMBOL(mod_timer_pending);
 
@@ -1068,10 +1084,25 @@ EXPORT_SYMBOL(mod_timer_pending);
  */
 int mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	return __mod_timer(timer, expires, false);
+	return __mod_timer(timer, expires, 0);
 }
 EXPORT_SYMBOL(mod_timer);
 
+/**
+ * timer_reduce - Modify a timer's timeout if it would reduce the timeout
+ * @timer:	The timer to be modified
+ * @expires:	New timeout in jiffies
+ *
+ * timer_reduce() is very similar to mod_timer(), except that it will only
+ * modify a running timer if that would reduce the expiration time (it will
+ * start a timer that isn't running).
+ */
+int timer_reduce(struct timer_list *timer, unsigned long expires)
+{
+	return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
+}
+EXPORT_SYMBOL(timer_reduce);
+
 /**
  * add_timer - start a timer
  * @timer: the timer to be added
@@ -1754,7 +1785,7 @@ signed long __sched schedule_timeout(signed long timeout)
 
 	timer.task = current;
 	timer_setup_on_stack(&timer.timer, process_timeout, 0);
-	__mod_timer(&timer.timer, expire, false);
+	__mod_timer(&timer.timer, expire, 0);
 	schedule();
 	del_singleshot_timer_sync(&timer.timer);
 
-- 
cgit v1.3-14-g43fede


From 6714796edcce27f7a1845e2f79783cd51bb4799b Mon Sep 17 00:00:00 2001
From: Wen Yaxng <wen.yang99@zte.com.cn>
Date: Wed, 8 Nov 2017 09:55:03 +0800
Subject: genirq/proc: Return proper error code when irq_set_affinity() fails

write_irq_affinity() returns the number of written bytes, which means
success, unconditionally whether the actual irq_set_affinity() call
succeeded or not.

Add proper error handling and pass the error code returned from
irq_set_affinity() back to user space in case of failure.

[ tglx: Fixed coding style and massaged changelog ]

Signed-off-by: Wen Yang <wen.yang99@zte.com.cn>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Jiang Biao <jiang.biao2@zte.com.cn>
Cc: zhong.weidong@zte.com.cn
Link: https://lkml.kernel.org/r/1510106103-184761-1-git-send-email-wen.yang99@zte.com.cn
---
 kernel/irq/proc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6376b4a598d3..29d6f520a9fb 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -154,8 +154,9 @@ static ssize_t write_irq_affinity(int type, struct file *file,
 		 */
 		err = irq_select_affinity_usr(irq) ? -EINVAL : count;
 	} else {
-		irq_set_affinity(irq, new_value);
-		err = count;
+		err = irq_set_affinity(irq, new_value);
+		if (!err)
+			err = count;
 	}
 
 free_cpumask:
-- 
cgit v1.3-14-g43fede


From 306eb5a38dfc1a4c8a5c910c9844da6a97f2b9a4 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Sun, 12 Nov 2017 22:29:03 +0100
Subject: irqdomain: Drop pointless NULL check in virq_debug_show_one

data has been already derefenced unconditionally, so it's pointless to do a
NULL pointer check on it afterwards. Drop it.

[ tglx: Depersonify changelog. ]

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: https://lkml.kernel.org/r/20171112212904.28574-1-linux@rasmusvillemoes.dk
---
 kernel/irq/irqdomain.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index fbbf34293b17..4f4f60015e8a 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -921,8 +921,7 @@ static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc)
 		chip = irq_data_get_irq_chip(data);
 		seq_printf(m, "%-15s  ", (chip && chip->name) ? chip->name : "none");
 
-		seq_printf(m, data ? "0x%p  " : "  %p  ",
-			   irq_data_get_irq_chip_data(data));
+		seq_printf(m, "0x%p  ", irq_data_get_irq_chip_data(data));
 
 		seq_printf(m, "   %c    ", (desc->action && desc->action->handler) ? '*' : ' ');
 		direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq);
-- 
cgit v1.3-14-g43fede


From ffc661c99f621152d5fdcf53f9df0d48c326318b Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Mon, 30 Oct 2017 22:35:47 +0100
Subject: genirq: Fix type of shifting literal 1 in __setup_irq()

If ffz() ever returns a value >= 31 then the following shift is undefined
behaviour because the literal 1 which gets shifted is treated as signed
integer.

In practice, the bug is probably harmless, since the first undefined shift
count is 31 which results - ignoring UB - in (int)(0x80000000). This gets
sign extended so bit 32-63 will be set as well and all subsequent
__setup_irq() calls would just end up hitting the -EBUSY branch.

However, a sufficiently aggressive optimizer may use the UB of 1<<31
to decide that doesn't happen, and hence elide the sign-extension
code, so that subsequent calls can indeed get ffz > 31.

In any case, the right thing to do is to make the literal 1UL.

[ tglx: For this to happen a single interrupt would have to be shared by 32
  	devices. Hardware like that does not exist and would have way more
  	problems than that. ]

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20171030213548.16831-1-linux@rasmusvillemoes.dk
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c65f282cbc9a..6a1bf9dc7a6a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1289,7 +1289,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		 * thread_mask assigned. See the loop above which or's
 		 * all existing action->thread_mask bits.
 		 */
-		new->thread_mask = 1 << ffz(thread_mask);
+		new->thread_mask = 1UL << ffz(thread_mask);
 
 	} else if (new->handler == irq_default_primary_handler &&
 		   !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
-- 
cgit v1.3-14-g43fede


From 277642dcca765a1955d4c753a5a315ff7f2eb09d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 12 Nov 2017 17:00:53 -0800
Subject: modules: make sysfs attribute files readable by owner only

This code goes back to the historical bitkeeper tree commit 3f7b0672086
("Module section offsets in /sys/module"), where Jonathan Corbet wanted
to show people how to debug loadable modules.

See

    https://lwn.net/Articles/88052/

from June 2004.

To expose the required load address information, Jonathan added the
sections subdirectory for every module in /sys/modules, and made them
S_IRUGO - readable by everybody.

It was a more innocent time, plus those S_IRxxx macro names are a lot
more confusing than the octal numbers are, so maybe it wasn't even
intentional.  But here we are, thirteen years later, and I'll just change
it to S_IRUSR instead.

Let's see if anybody even notices.

Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..fdb3a6aca363 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1516,7 +1516,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
 		sattr->mattr.show = module_sect_show;
 		sattr->mattr.store = NULL;
 		sattr->mattr.attr.name = sattr->name;
-		sattr->mattr.attr.mode = S_IRUGO;
+		sattr->mattr.attr.mode = S_IRUSR;
 		*(gattr++) = &(sattr++)->mattr.attr;
 	}
 	*gattr = NULL;
-- 
cgit v1.3-14-g43fede


From 516fb7f2e73dcc303fb97fc3593209fcacf2d982 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 12 Nov 2017 18:44:23 -0800
Subject: /proc/module: use the same logic as /proc/kallsyms for address
 exposure

The (alleged) users of the module addresses are the same: kernel
profiling.

So just expose the same helper and format macros, and unify the logic.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kallsyms.h |  8 ++++++++
 kernel/kallsyms.c        |  8 +-------
 kernel/module.c          | 20 ++++++++++++++++++--
 3 files changed, 27 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index 11dd93e42580..0a777c5216b1 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -14,6 +14,14 @@
 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \
 			 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1)
 
+/* How and when do we show kallsyms values? */
+extern int kallsyms_show_value(void);
+#ifndef CONFIG_64BIT
+# define KALLSYM_FMT "%08lx"
+#else
+# define KALLSYM_FMT "%016lx"
+#endif
+
 struct module;
 
 #ifdef CONFIG_KALLSYMS
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 51b49ed452e4..1e6ae66c6244 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -581,12 +581,6 @@ static void s_stop(struct seq_file *m, void *p)
 {
 }
 
-#ifndef CONFIG_64BIT
-# define KALLSYM_FMT "%08lx"
-#else
-# define KALLSYM_FMT "%016lx"
-#endif
-
 static int s_show(struct seq_file *m, void *p)
 {
 	unsigned long value;
@@ -640,7 +634,7 @@ static inline int kallsyms_for_perf(void)
  * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
  * block even that).
  */
-static int kallsyms_show_value(void)
+int kallsyms_show_value(void)
 {
 	switch (kptr_restrict) {
 	case 0:
diff --git a/kernel/module.c b/kernel/module.c
index fdb3a6aca363..0122747ba150 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4147,6 +4147,7 @@ static int m_show(struct seq_file *m, void *p)
 {
 	struct module *mod = list_entry(p, struct module, list);
 	char buf[MODULE_FLAGS_BUF_SIZE];
+	unsigned long value;
 
 	/* We always ignore unformed modules. */
 	if (mod->state == MODULE_STATE_UNFORMED)
@@ -4162,7 +4163,8 @@ static int m_show(struct seq_file *m, void *p)
 		   mod->state == MODULE_STATE_COMING ? "Loading" :
 		   "Live");
 	/* Used by oprofile and other similar tools. */
-	seq_printf(m, " 0x%pK", mod->core_layout.base);
+	value = m->private ? 0 : (unsigned long)mod->core_layout.base;
+	seq_printf(m, " 0x" KALLSYM_FMT, value);
 
 	/* Taints info */
 	if (mod->taints)
@@ -4184,9 +4186,23 @@ static const struct seq_operations modules_op = {
 	.show	= m_show
 };
 
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
 static int modules_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &modules_op);
+	int err = seq_open(file, &modules_op);
+
+	if (!err) {
+		struct seq_file *m = file->private_data;
+		m->private = kallsyms_show_value() ? NULL : (void *)8ul;
+	}
+
+	return 0;
 }
 
 static const struct file_operations proc_modules_operations = {
-- 
cgit v1.3-14-g43fede


From 8e7df2b5b7f245c9bd11064712db5cb69044a362 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Mon, 13 Nov 2017 07:15:41 +0100
Subject: timer/debug: Change /proc/timer_list from 0444 to 0400

While it uses %pK, there's still few reasons to read this file
as non-root.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/timer_list.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 0e7f5428a148..0ed768b56c60 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -389,7 +389,7 @@ static int __init init_timer_list_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
+	pe = proc_create("timer_list", 0400, NULL, &timer_list_fops);
 	if (!pe)
 		return -ENOMEM;
 	return 0;
-- 
cgit v1.3-14-g43fede


From 5e4def20381678ba3ce0a4e117f97e378ecd81bc Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Nov 2017 15:27:44 +0000
Subject: Pass mode to wait_on_atomic_t() action funcs and provide default
 actions

Make wait_on_atomic_t() pass the TASK_* mode onto its action function as an
extra argument and make it 'unsigned int throughout.

Also, consolidate a bunch of identical action functions into a default
function that can do the appropriate thing for the mode.

Also, change the argument name in the bit_wait*() function declarations to
reflect the fact that it's the mode and not the bit number.

[Peter Z gives this a grudging ACK, but thinks that the whole atomic_t wait
should be done differently, though he's not immediately sure as to how]

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
cc: Ingo Molnar <mingo@kernel.org>
---
 arch/mips/kernel/traps.c                           | 14 +----------
 drivers/gpu/drm/drm_dp_aux_dev.c                   |  8 +------
 drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c | 10 ++------
 drivers/media/platform/qcom/venus/hfi.c            |  8 +------
 fs/afs/rxrpc.c                                     |  8 +------
 fs/btrfs/extent-tree.c                             | 27 +++-------------------
 fs/fscache/cookie.c                                |  2 +-
 fs/fscache/internal.h                              |  2 --
 fs/fscache/main.c                                  |  9 --------
 fs/nfs/inode.c                                     |  4 ++--
 fs/nfs/internal.h                                  |  2 +-
 fs/ocfs2/filecheck.c                               |  8 +------
 include/linux/wait_bit.h                           | 15 +++++++-----
 kernel/sched/wait_bit.c                            | 18 +++++++++++----
 14 files changed, 37 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 5669d3b8bd38..5d19ed07e99d 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -1233,18 +1233,6 @@ static int default_cu2_call(struct notifier_block *nfb, unsigned long action,
 	return NOTIFY_OK;
 }
 
-static int wait_on_fp_mode_switch(atomic_t *p)
-{
-	/*
-	 * The FP mode for this task is currently being switched. That may
-	 * involve modifications to the format of this tasks FP context which
-	 * make it unsafe to proceed with execution for the moment. Instead,
-	 * schedule some other task.
-	 */
-	schedule();
-	return 0;
-}
-
 static int enable_restore_fp_context(int msa)
 {
 	int err, was_fpu_owner, prior_msa;
@@ -1254,7 +1242,7 @@ static int enable_restore_fp_context(int msa)
 	 * complete before proceeding.
 	 */
 	wait_on_atomic_t(&current->mm->context.fp_mode_switching,
-			 wait_on_fp_mode_switch, TASK_KILLABLE);
+			 atomic_t_wait, TASK_KILLABLE);
 
 	if (!used_math()) {
 		/* First time FP context user. */
diff --git a/drivers/gpu/drm/drm_dp_aux_dev.c b/drivers/gpu/drm/drm_dp_aux_dev.c
index d34e5096887a..053044201e31 100644
--- a/drivers/gpu/drm/drm_dp_aux_dev.c
+++ b/drivers/gpu/drm/drm_dp_aux_dev.c
@@ -263,12 +263,6 @@ static struct drm_dp_aux_dev *drm_dp_aux_dev_get_by_aux(struct drm_dp_aux *aux)
 	return aux_dev;
 }
 
-static int auxdev_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 void drm_dp_aux_unregister_devnode(struct drm_dp_aux *aux)
 {
 	struct drm_dp_aux_dev *aux_dev;
@@ -283,7 +277,7 @@ void drm_dp_aux_unregister_devnode(struct drm_dp_aux *aux)
 	mutex_unlock(&aux_idr_mutex);
 
 	atomic_dec(&aux_dev->usecount);
-	wait_on_atomic_t(&aux_dev->usecount, auxdev_wait_atomic_t,
+	wait_on_atomic_t(&aux_dev->usecount, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 
 	minor = aux_dev->index;
diff --git a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
index 828904b7d468..54fc571b1102 100644
--- a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
@@ -271,13 +271,7 @@ struct igt_wakeup {
 	u32 seqno;
 };
 
-static int wait_atomic(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
-static int wait_atomic_timeout(atomic_t *p)
+static int wait_atomic_timeout(atomic_t *p, unsigned int mode)
 {
 	return schedule_timeout(10 * HZ) ? 0 : -ETIMEDOUT;
 }
@@ -348,7 +342,7 @@ static void igt_wake_all_sync(atomic_t *ready,
 	atomic_set(ready, 0);
 	wake_up_all(wq);
 
-	wait_on_atomic_t(set, wait_atomic, TASK_UNINTERRUPTIBLE);
+	wait_on_atomic_t(set, atomic_t_wait, TASK_UNINTERRUPTIBLE);
 	atomic_set(ready, count);
 	atomic_set(done, count);
 }
diff --git a/drivers/media/platform/qcom/venus/hfi.c b/drivers/media/platform/qcom/venus/hfi.c
index c09490876516..e374c7d1a618 100644
--- a/drivers/media/platform/qcom/venus/hfi.c
+++ b/drivers/media/platform/qcom/venus/hfi.c
@@ -88,12 +88,6 @@ unlock:
 	return ret;
 }
 
-static int core_deinit_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 int hfi_core_deinit(struct venus_core *core, bool blocking)
 {
 	int ret = 0, empty;
@@ -112,7 +106,7 @@ int hfi_core_deinit(struct venus_core *core, bool blocking)
 
 	if (!empty) {
 		mutex_unlock(&core->lock);
-		wait_on_atomic_t(&core->insts_count, core_deinit_wait_atomic_t,
+		wait_on_atomic_t(&core->insts_count, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 		mutex_lock(&core->lock);
 	}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index bb1e2caa1720..77f5420a1a24 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -41,12 +41,6 @@ static void afs_charge_preallocation(struct work_struct *);
 
 static DECLARE_WORK(afs_charge_preallocation_work, afs_charge_preallocation);
 
-static int afs_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 /*
  * open an RxRPC socket and bind it to be a server for callback notifications
  * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -121,7 +115,7 @@ void afs_close_socket(void)
 	}
 
 	_debug("outstanding %u", atomic_read(&afs_outstanding_calls));
-	wait_on_atomic_t(&afs_outstanding_calls, afs_wait_atomic_t,
+	wait_on_atomic_t(&afs_outstanding_calls, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 	_debug("no outstanding calls");
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2d7e86b51d1..24cefde30e30 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4016,16 +4016,9 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
 	btrfs_put_block_group(bg);
 }
 
-static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
 {
-	wait_on_atomic_t(&bg->nocow_writers,
-			 btrfs_wait_nocow_writers_atomic_t,
+	wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 }
 
@@ -6595,12 +6588,6 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 	btrfs_put_block_group(bg);
 }
 
-static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 {
 	struct btrfs_space_info *space_info = bg->space_info;
@@ -6623,8 +6610,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
 	down_write(&space_info->groups_sem);
 	up_write(&space_info->groups_sem);
 
-	wait_on_atomic_t(&bg->reservations,
-			 btrfs_wait_bg_reservations_atomic_t,
+	wait_on_atomic_t(&bg->reservations, atomic_t_wait,
 			 TASK_UNINTERRUPTIBLE);
 }
 
@@ -11106,12 +11092,6 @@ int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
 	return 1;
 }
 
-static int wait_snapshotting_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
 {
 	while (true) {
@@ -11120,8 +11100,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
 		ret = btrfs_start_write_no_snapshotting(root);
 		if (ret)
 			break;
-		wait_on_atomic_t(&root->will_be_snapshotted,
-				 wait_snapshotting_atomic_t,
+		wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 	}
 }
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 40d61077bead..ff84258132bb 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -558,7 +558,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
 	 * have completed.
 	 */
 	if (!atomic_dec_and_test(&cookie->n_active))
-		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+		wait_on_atomic_t(&cookie->n_active, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 
 	/* Make sure any pending writes are cancelled. */
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 97ec45110957..0ff4b49a0037 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void)
 	return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
 }
 
-extern int fscache_wait_atomic_t(atomic_t *);
-
 /*
  * object.c
  */
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index b39d487ccfb0..249968dcbf5c 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -195,12 +195,3 @@ static void __exit fscache_exit(void)
 }
 
 module_exit(fscache_exit);
-
-/*
- * wait_on_atomic_t() sleep function for uninterruptible waiting
- */
-int fscache_wait_atomic_t(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 134d9f560240..1629056aa2c9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -85,9 +85,9 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
 }
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
 
-int nfs_wait_atomic_killable(atomic_t *p)
+int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode)
 {
-	return nfs_wait_killable(TASK_KILLABLE);
+	return nfs_wait_killable(mode);
 }
 
 /**
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f9a4a5524bd5..5ab17fd4700a 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -388,7 +388,7 @@ extern void nfs_evict_inode(struct inode *);
 void nfs_zap_acl_cache(struct inode *inode);
 extern bool nfs_check_cache_invalid(struct inode *, unsigned long);
 extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
-extern int nfs_wait_atomic_killable(atomic_t *p);
+extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode);
 
 /* super.c */
 extern const struct super_operations nfs_sops;
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 2cabbcf2f28e..e87279e49ba3 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -129,19 +129,13 @@ static struct kobj_attribute ocfs2_attr_filecheck_set =
 					ocfs2_filecheck_show,
 					ocfs2_filecheck_store);
 
-static int ocfs2_filecheck_sysfs_wait(atomic_t *p)
-{
-	schedule();
-	return 0;
-}
-
 static void
 ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
 {
 	struct ocfs2_filecheck_entry *p;
 
 	if (!atomic_dec_and_test(&entry->fs_count))
-		wait_on_atomic_t(&entry->fs_count, ocfs2_filecheck_sysfs_wait,
+		wait_on_atomic_t(&entry->fs_count, atomic_t_wait,
 				 TASK_UNINTERRUPTIBLE);
 
 	spin_lock(&entry->fs_fcheck->fc_lock);
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index af0d495430d7..61b39eaf7cad 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -26,6 +26,8 @@ struct wait_bit_queue_entry {
 	{ .flags = p, .bit_nr = WAIT_ATOMIC_T_BIT_NR, }
 
 typedef int wait_bit_action_f(struct wait_bit_key *key, int mode);
+typedef int wait_atomic_t_action_f(atomic_t *counter, unsigned int mode);
+
 void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit);
 int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
 int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode);
@@ -34,7 +36,7 @@ void wake_up_atomic_t(atomic_t *p);
 int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode);
 int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout);
 int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode);
-int out_of_line_wait_on_atomic_t(atomic_t *p, int (*)(atomic_t *), unsigned int mode);
+int out_of_line_wait_on_atomic_t(atomic_t *p, wait_atomic_t_action_f action, unsigned int mode);
 struct wait_queue_head *bit_waitqueue(void *word, int bit);
 extern void __init wait_bit_init(void);
 
@@ -51,10 +53,11 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 		},								\
 	}
 
-extern int bit_wait(struct wait_bit_key *key, int bit);
-extern int bit_wait_io(struct wait_bit_key *key, int bit);
-extern int bit_wait_timeout(struct wait_bit_key *key, int bit);
-extern int bit_wait_io_timeout(struct wait_bit_key *key, int bit);
+extern int bit_wait(struct wait_bit_key *key, int mode);
+extern int bit_wait_io(struct wait_bit_key *key, int mode);
+extern int bit_wait_timeout(struct wait_bit_key *key, int mode);
+extern int bit_wait_io_timeout(struct wait_bit_key *key, int mode);
+extern int atomic_t_wait(atomic_t *counter, unsigned int mode);
 
 /**
  * wait_on_bit - wait for a bit to be cleared
@@ -251,7 +254,7 @@ wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action,
  * outside of the target 'word'.
  */
 static inline
-int wait_on_atomic_t(atomic_t *val, int (*action)(atomic_t *), unsigned mode)
+int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode)
 {
 	might_sleep();
 	if (atomic_read(val) == 0)
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index f8159698aa4d..84cb3acd9260 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
  */
 static __sched
 int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
-		       int (*action)(atomic_t *), unsigned mode)
+		       wait_atomic_t_action_f action, unsigned int mode)
 {
 	atomic_t *val;
 	int ret = 0;
@@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
 		val = wbq_entry->key.flags;
 		if (atomic_read(val) == 0)
 			break;
-		ret = (*action)(val);
+		ret = (*action)(val, mode);
 	} while (!ret && atomic_read(val) != 0);
 	finish_wait(wq_head, &wbq_entry->wq_entry);
 	return ret;
@@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
 		},							\
 	}
 
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
-					 unsigned mode)
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
+					 wait_atomic_t_action_f action,
+					 unsigned int mode)
 {
 	struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
 	DEFINE_WAIT_ATOMIC_T(wq_entry, p);
@@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
 }
 EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
 
+__sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
+{
+	schedule();
+	if (signal_pending_state(mode, current))
+		return -EINTR;
+	return 0;
+}
+EXPORT_SYMBOL(atomic_t_wait);
+
 /**
  * wake_up_atomic_t - Wake up a waiter on a atomic_t
  * @p: The atomic_t being waited on, a kernel virtual address
-- 
cgit v1.3-14-g43fede


From 9fd29c08e52023252f0480ab8f6906a1ecc9a8d5 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 12 Nov 2017 14:49:09 -0800
Subject: bpf: improve verifier ARG_CONST_SIZE_OR_ZERO semantics

For helpers, the argument type ARG_CONST_SIZE_OR_ZERO permits the
access size to be 0 when accessing the previous argument (arg).
Right now, it requires the arg needs to be NULL when size passed
is 0 or could be 0. It also requires a non-NULL arg when the size
is proved to be non-0.

This patch changes verifier ARG_CONST_SIZE_OR_ZERO behavior
such that for size-0 or possible size-0, it is not required
the arg equal to NULL.

There are a couple of reasons for this semantics change, and
all of them intends to simplify user bpf programs which
may improve user experience and/or increase chances of
verifier acceptance. Together with the next patch which
changes bpf_probe_read arg2 type from ARG_CONST_SIZE to
ARG_CONST_SIZE_OR_ZERO, the following two examples, which
fail the verifier currently, are able to get verifier acceptance.

Example 1:
   unsigned long len = pend - pstart;
   len = len > MAX_PAYLOAD_LEN ? MAX_PAYLOAD_LEN : len;
   len &= MAX_PAYLOAD_LEN;
   bpf_probe_read(data->payload, len, pstart);

It does not have test for "len > 0" and it failed the verifier.
Users may not be aware that they have to add this test.
Converting the bpf_probe_read helper to have
ARG_CONST_SIZE_OR_ZERO helps the above code get
verifier acceptance.

Example 2:
  Here is one example where llvm "messed up" the code and
  the verifier fails.

......
   unsigned long len = pend - pstart;
   if (len > 0 && len <= MAX_PAYLOAD_LEN)
     bpf_probe_read(data->payload, len, pstart);
......

The compiler generates the following code and verifier fails:
......
39: (79) r2 = *(u64 *)(r10 -16)
40: (1f) r2 -= r8
41: (bf) r1 = r2
42: (07) r1 += -1
43: (25) if r1 > 0xffe goto pc+3
  R0=inv(id=0) R1=inv(id=0,umax_value=4094,var_off=(0x0; 0xfff))
  R2=inv(id=0) R6=map_value(id=0,off=0,ks=4,vs=4095,imm=0) R7=inv(id=0)
  R8=inv(id=0) R9=inv0 R10=fp0
44: (bf) r1 = r6
45: (bf) r3 = r8
46: (85) call bpf_probe_read#45
R2 min value is negative, either use unsigned or 'var &= const'
......

The compiler optimization is correct. If r1 = 0,
r1 - 1 = 0xffffffffffffffff > 0xffe.  If r1 != 0, r1 - 1 will not wrap.
r1 > 0xffe at insn #43 can actually capture
both "r1 > 0" and "len <= MAX_PAYLOAD_LEN".
This however causes an issue in verifier as the value range of arg2
"r2" does not properly get refined and lead to verification failure.

Relaxing bpf_prog_read arg2 from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO
allows the following simplied code:
   unsigned long len = pend - pstart;
   if (len <= MAX_PAYLOAD_LEN)
     bpf_probe_read(data->payload, len, pstart);

The llvm compiler will generate less complex code and the
verifier is able to verify that the program is okay.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 40 ++++++++++++++++++++++++----------------
 1 file changed, 24 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 4a942e2e753d..dd54d20ace2f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -799,12 +799,13 @@ static int check_stack_read(struct bpf_verifier_env *env,
 
 /* check read/write into map element returned by bpf_map_lookup_elem() */
 static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
-			    int size)
+			      int size, bool zero_size_allowed)
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_map *map = regs[regno].map_ptr;
 
-	if (off < 0 || size <= 0 || off + size > map->value_size) {
+	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+	    off + size > map->value_size) {
 		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
 			map->value_size, off, size);
 		return -EACCES;
@@ -814,7 +815,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 
 /* check read/write into a map element with possible variable offset */
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
-			    int off, int size)
+			    int off, int size, bool zero_size_allowed)
 {
 	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *reg = &state->regs[regno];
@@ -837,7 +838,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			regno);
 		return -EACCES;
 	}
-	err = __check_map_access(env, regno, reg->smin_value + off, size);
+	err = __check_map_access(env, regno, reg->smin_value + off, size,
+				 zero_size_allowed);
 	if (err) {
 		verbose(env, "R%d min value is outside of the array range\n",
 			regno);
@@ -853,7 +855,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 			regno);
 		return -EACCES;
 	}
-	err = __check_map_access(env, regno, reg->umax_value + off, size);
+	err = __check_map_access(env, regno, reg->umax_value + off, size,
+				 zero_size_allowed);
 	if (err)
 		verbose(env, "R%d max value is outside of the array range\n",
 			regno);
@@ -889,12 +892,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 }
 
 static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
-				 int off, int size)
+				 int off, int size, bool zero_size_allowed)
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 
-	if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
+	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+	    (u64)off + size > reg->range) {
 		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
 			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
@@ -903,7 +907,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 }
 
 static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
-			       int size)
+			       int size, bool zero_size_allowed)
 {
 	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
@@ -922,7 +926,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 			regno);
 		return -EACCES;
 	}
-	err = __check_packet_access(env, regno, off, size);
+	err = __check_packet_access(env, regno, off, size, zero_size_allowed);
 	if (err) {
 		verbose(env, "R%d offset is outside of the packet\n", regno);
 		return err;
@@ -1097,7 +1101,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			return -EACCES;
 		}
 
-		err = check_map_access(env, regno, off, size);
+		err = check_map_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 
@@ -1184,7 +1188,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 				value_regno);
 			return -EACCES;
 		}
-		err = check_packet_access(env, regno, off, size);
+		err = check_packet_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else {
@@ -1281,7 +1285,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 	}
 	off = regs[regno].off + regs[regno].var_off.value;
 	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
-	    access_size <= 0) {
+	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
 		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
 			regno, off, access_size);
 		return -EACCES;
@@ -1319,9 +1323,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	switch (reg->type) {
 	case PTR_TO_PACKET:
 	case PTR_TO_PACKET_META:
-		return check_packet_access(env, regno, reg->off, access_size);
+		return check_packet_access(env, regno, reg->off, access_size,
+					   zero_size_allowed);
 	case PTR_TO_MAP_VALUE:
-		return check_map_access(env, regno, reg->off, access_size);
+		return check_map_access(env, regno, reg->off, access_size,
+					zero_size_allowed);
 	default: /* scalar_value|ptr_to_stack or invalid ptr */
 		return check_stack_boundary(env, regno, access_size,
 					    zero_size_allowed, meta);
@@ -1415,7 +1421,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		}
 		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->key_size);
+						  meta->map_ptr->key_size,
+						  false);
 		else
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->key_size,
@@ -1431,7 +1438,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		}
 		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->value_size);
+						  meta->map_ptr->value_size,
+						  false);
 		else
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->value_size,
-- 
cgit v1.3-14-g43fede


From 9c019e2bc4b2bd8223c8c0d4b6962478b479834d Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Sun, 12 Nov 2017 14:49:10 -0800
Subject: bpf: change helper bpf_probe_read arg2 type to ARG_CONST_SIZE_OR_ZERO

The helper bpf_probe_read arg2 type is changed
from ARG_CONST_SIZE to ARG_CONST_SIZE_OR_ZERO to permit
size-0 buffer. Together with newer ARG_CONST_SIZE_OR_ZERO
semantics which allows non-NULL buffer with size 0,
this allows simpler bpf programs with verifier acceptance.
The previous commit which changes ARG_CONST_SIZE_OR_ZERO semantics
has details on examples.

Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/trace/bpf_trace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 506efe6e8ed9..a5580c670866 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -78,12 +78,16 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
 
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
-	int ret;
+	int ret = 0;
+
+	if (unlikely(size == 0))
+		goto out;
 
 	ret = probe_kernel_read(dst, unsafe_ptr, size);
 	if (unlikely(ret < 0))
 		memset(dst, 0, size);
 
+ out:
 	return ret;
 }
 
@@ -92,7 +96,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
-	.arg2_type	= ARG_CONST_SIZE,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg3_type	= ARG_ANYTHING,
 };
 
-- 
cgit v1.3-14-g43fede


From 92ee46efeb505ead3ab06d3c5ce695637ed5f152 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@akamai.com>
Date: Mon, 13 Nov 2017 16:48:47 -0500
Subject: jump_label: Invoke jump_label_test() via early_initcall()

Fengguang Wu reported that running the rcuperf test during boot can cause
the jump_label_test() to hit a WARN_ON(). The issue is that the core jump
label code relies on kernel_text_address() to detect when it can no longer
update branches that may be contained in __init sections. The
kernel_text_address() in turn assumes that if the system_state variable is
greter than or equal to SYSTEM_RUNNING then __init sections are no longer
valid (since the assumption is that they have been freed). However, when
rcuperf is setup to run in early boot it can call kernel_power_off() which
sets the system_state to SYSTEM_POWER_OFF.

Since rcuperf initialization is invoked via a module_init(), we can make
the dependency of jump_label_test() needing to complete before rcuperf
explicit by calling it via early_initcall().

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Jason Baron <jbaron@akamai.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1510609727-2238-1-git-send-email-jbaron@akamai.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/jump_label.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8ff4ca4665ff..8594d24e4adc 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -769,7 +769,7 @@ static __init int jump_label_test(void)
 
 	return 0;
 }
-late_initcall(jump_label_test);
+early_initcall(jump_label_test);
 #endif /* STATIC_KEYS_SELFTEST */
 
 #endif /* HAVE_JUMP_LABEL */
-- 
cgit v1.3-14-g43fede


From aea3706cfc4d952ed6d32b6d5845b5ecd99ed7f5 Mon Sep 17 00:00:00 2001
From: Miroslav Lichvar <mlichvar@redhat.com>
Date: Mon, 13 Nov 2017 14:51:31 -0800
Subject: timekeeping: Remove CONFIG_GENERIC_TIME_VSYSCALL_OLD

As of d4d1fc61eb38f (ia64: Update fsyscall gettime to use modern
vsyscall_update)the last user of CONFIG_GENERIC_TIME_VSYSCALL_OLD
have been updated, the legacy support for old-style vsyscall
implementations can be removed from the timekeeping code.

(Thanks again to Tony Luck for helping remove the last user!)

[jstultz: Commit message rework]

Signed-off-by: Miroslav Lichvar <mlichvar@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Stephen Boyd <stephen.boyd@linaro.org>
Link: https://lkml.kernel.org/r/1510613491-16695-1-git-send-email-john.stultz@linaro.org
---
 include/linux/timekeeper_internal.h |  7 ------
 kernel/time/Kconfig                 |  4 ----
 kernel/time/timekeeping.c           | 45 -------------------------------------
 3 files changed, 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 7e9011101cb0..d315c3d6725c 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -136,13 +136,6 @@ struct timekeeper {
 extern void update_vsyscall(struct timekeeper *tk);
 extern void update_vsyscall_tz(void);
 
-#elif defined(CONFIG_GENERIC_TIME_VSYSCALL_OLD)
-
-extern void update_vsyscall_old(struct timespec *ts, struct timespec *wtm,
-				struct clocksource *c, u32 mult,
-				u64 cycle_last);
-extern void update_vsyscall_tz(void);
-
 #else
 
 static inline void update_vsyscall(struct timekeeper *tk)
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d689a9557e17..e776fc8cc1df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -21,10 +21,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE
 config GENERIC_TIME_VSYSCALL
 	bool
 
-# Timekeeping vsyscall support
-config GENERIC_TIME_VSYSCALL_OLD
-	bool
-
 # Old style timekeeping
 config ARCH_USES_GETTIMEOFFSET
 	bool
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 198afa78bf69..cd03317e7b57 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -557,45 +557,6 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
 	update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
 }
 
-#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
-#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon.
-
-static inline void update_vsyscall(struct timekeeper *tk)
-{
-	struct timespec xt, wm;
-
-	xt = timespec64_to_timespec(tk_xtime(tk));
-	wm = timespec64_to_timespec(tk->wall_to_monotonic);
-	update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
-			    tk->tkr_mono.cycle_last);
-}
-
-static inline void old_vsyscall_fixup(struct timekeeper *tk)
-{
-	s64 remainder;
-
-	/*
-	* Store only full nanoseconds into xtime_nsec after rounding
-	* it up and add the remainder to the error difference.
-	* XXX - This is necessary to avoid small 1ns inconsistnecies caused
-	* by truncating the remainder in vsyscalls. However, it causes
-	* additional work to be done in timekeeping_adjust(). Once
-	* the vsyscall implementations are converted to use xtime_nsec
-	* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
-	* users are removed, this can be killed.
-	*/
-	remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
-	if (remainder != 0) {
-		tk->tkr_mono.xtime_nsec -= remainder;
-		tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
-		tk->ntp_error += remainder << tk->ntp_error_shift;
-		tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
-	}
-}
-#else
-#define old_vsyscall_fixup(tk)
-#endif
-
 static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
 
 static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
@@ -2163,12 +2124,6 @@ void update_wall_time(void)
 	/* correct the clock when NTP error is too big */
 	timekeeping_adjust(tk, offset);
 
-	/*
-	 * XXX This can be killed once everyone converts
-	 * to the new update_vsyscall.
-	 */
-	old_vsyscall_fixup(tk);
-
 	/*
 	 * Finally, make sure that after the rounding
 	 * xtime_nsec isn't larger than NSEC_PER_SEC
-- 
cgit v1.3-14-g43fede


From 4a31b424ac0656d1bb17520ee861144fe7a19664 Mon Sep 17 00:00:00 2001
From: Vasily Averin <vvs@virtuozzo.com>
Date: Wed, 15 Nov 2017 08:47:02 +0300
Subject: perf/core: Fix memory leak triggered by perf --namespace

perf with --namespace key leaks various memory objects including namespaces

  4.14.0+
  pid_namespace          1     12   2568   12    8
  user_namespace         1     39    824   39    8
  net_namespace          1      5   6272    5    8

This happen because perf_fill_ns_link_info() struct patch ns_path:
during initialization ns_path incremented counters on related mnt and dentry,
but without lost path_put nobody decremented them back.
Leaked dentry is name of related namespace,
and its leak does not allow to free unused namespace.

Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Hari Bathini <hbathini@linux.vnet.ibm.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: commit e422267322cd ("perf: Add PERF_RECORD_NAMESPACES to include namespaces related info")
Link: http://lkml.kernel.org/r/c510711b-3904-e5e1-d296-61273d21118d@virtuozzo.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10cdb9c26b5d..ab5ac84f82e2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6756,6 +6756,7 @@ static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
 		ns_inode = ns_path.dentry->d_inode;
 		ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
 		ns_link_info->ino = ns_inode->i_ino;
+		path_put(&ns_path);
 	}
 }
 
-- 
cgit v1.3-14-g43fede


From 89ad2fa3f043a1e8daae193bcb5fe34d5f8caf28 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 14 Nov 2017 17:15:50 -0800
Subject: bpf: fix lockdep splat

pcpu_freelist_pop() needs the same lockdep awareness than
pcpu_freelist_populate() to avoid a false positive.

 [ INFO: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected ]

 switchto-defaul/12508 [HC0[0]:SC0[6]:HE0:SE0] is trying to acquire:
  (&htab->buckets[i].lock){......}, at: [<ffffffff9dc099cb>] __htab_percpu_map_update_elem+0x1cb/0x300

 and this task is already holding:
  (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...}, at: [<ffffffff9e135848>] __dev_queue_xmit+0
x868/0x1240
 which would create a new lock dependency:
  (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...} -> (&htab->buckets[i].lock){......}

 but this new dependency connects a SOFTIRQ-irq-safe lock:
  (dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2){+.-...}
 ... which became SOFTIRQ-irq-safe at:
   [<ffffffff9db5931b>] __lock_acquire+0x42b/0x1f10
   [<ffffffff9db5b32c>] lock_acquire+0xbc/0x1b0
   [<ffffffff9da05e38>] _raw_spin_lock+0x38/0x50
   [<ffffffff9e135848>] __dev_queue_xmit+0x868/0x1240
   [<ffffffff9e136240>] dev_queue_xmit+0x10/0x20
   [<ffffffff9e1965d9>] ip_finish_output2+0x439/0x590
   [<ffffffff9e197410>] ip_finish_output+0x150/0x2f0
   [<ffffffff9e19886d>] ip_output+0x7d/0x260
   [<ffffffff9e19789e>] ip_local_out+0x5e/0xe0
   [<ffffffff9e197b25>] ip_queue_xmit+0x205/0x620
   [<ffffffff9e1b8398>] tcp_transmit_skb+0x5a8/0xcb0
   [<ffffffff9e1ba152>] tcp_write_xmit+0x242/0x1070
   [<ffffffff9e1baffc>] __tcp_push_pending_frames+0x3c/0xf0
   [<ffffffff9e1b3472>] tcp_rcv_established+0x312/0x700
   [<ffffffff9e1c1acc>] tcp_v4_do_rcv+0x11c/0x200
   [<ffffffff9e1c3dc2>] tcp_v4_rcv+0xaa2/0xc30
   [<ffffffff9e191107>] ip_local_deliver_finish+0xa7/0x240
   [<ffffffff9e191a36>] ip_local_deliver+0x66/0x200
   [<ffffffff9e19137d>] ip_rcv_finish+0xdd/0x560
   [<ffffffff9e191e65>] ip_rcv+0x295/0x510
   [<ffffffff9e12ff88>] __netif_receive_skb_core+0x988/0x1020
   [<ffffffff9e130641>] __netif_receive_skb+0x21/0x70
   [<ffffffff9e1306ff>] process_backlog+0x6f/0x230
   [<ffffffff9e132129>] net_rx_action+0x229/0x420
   [<ffffffff9da07ee8>] __do_softirq+0xd8/0x43d
   [<ffffffff9e282bcc>] do_softirq_own_stack+0x1c/0x30
   [<ffffffff9dafc2f5>] do_softirq+0x55/0x60
   [<ffffffff9dafc3a8>] __local_bh_enable_ip+0xa8/0xb0
   [<ffffffff9db4c727>] cpu_startup_entry+0x1c7/0x500
   [<ffffffff9daab333>] start_secondary+0x113/0x140

 to a SOFTIRQ-irq-unsafe lock:
  (&head->lock){+.+...}
 ... which became SOFTIRQ-irq-unsafe at:
 ...  [<ffffffff9db5971f>] __lock_acquire+0x82f/0x1f10
   [<ffffffff9db5b32c>] lock_acquire+0xbc/0x1b0
   [<ffffffff9da05e38>] _raw_spin_lock+0x38/0x50
   [<ffffffff9dc0b7fa>] pcpu_freelist_pop+0x7a/0xb0
   [<ffffffff9dc08b2c>] htab_map_alloc+0x50c/0x5f0
   [<ffffffff9dc00dc5>] SyS_bpf+0x265/0x1200
   [<ffffffff9e28195f>] entry_SYSCALL_64_fastpath+0x12/0x17

 other info that might help us debug this:

 Chain exists of:
   dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2 --> &htab->buckets[i].lock --> &head->lock

  Possible interrupt unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&head->lock);
                                local_irq_disable();
                                lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2);
                                lock(&htab->buckets[i].lock);
   <Interrupt>
     lock(dev_queue->dev->qdisc_class ?: &qdisc_tx_lock#2);

  *** DEADLOCK ***

Fixes: e19494edab82 ("bpf: introduce percpu_freelist")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/percpu_freelist.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 5c51d1985b51..673fa6fe2d73 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
+	unsigned long flags;
 	int orig_cpu, cpu;
 
+	local_irq_save(flags);
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
@@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 		node = head->first;
 		if (node) {
 			head->first = node->next;
-			raw_spin_unlock(&head->lock);
+			raw_spin_unlock_irqrestore(&head->lock, flags);
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
-		if (cpu == orig_cpu)
+		if (cpu == orig_cpu) {
+			local_irq_restore(flags);
 			return NULL;
+		}
 	}
 }
-- 
cgit v1.3-14-g43fede


From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:33 -0800
Subject: mm: account pud page tables

On a machine with 5-level paging support a process can allocate
significant amount of memory and stay unnoticed by oom-killer and memory
cgroup.  The trick is to allocate a lot of PUD page tables.  We don't
account PUD page tables, only PMD and PTE.

We already addressed the same issue for PMD page tables, see commit
dc6c9a35b66b ("mm: account pmd page tables to the process").
Introduction of 5-level paging brings the same issue for PUD page
tables.

The patch expands accounting to PUD level.

[kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/]
  Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com
[heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting]
  Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt         |  8 ++++----
 arch/powerpc/mm/hugetlbpage.c       |  1 +
 arch/s390/include/asm/mmu_context.h |  4 +++-
 arch/sparc/mm/hugetlbpage.c         |  1 +
 fs/proc/task_mmu.c                  |  5 ++++-
 include/linux/mm.h                  | 36 +++++++++++++++++++++++++++++++++---
 include/linux/mm_types.h            |  3 +++
 kernel/fork.c                       |  4 ++++
 mm/debug.c                          |  6 ++++--
 mm/memory.c                         | 15 +++++++++------
 mm/oom_kill.c                       |  8 +++++---
 11 files changed, 71 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 30fd16b14196..2729e8db9492 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -629,10 +629,10 @@ oom_dump_tasks
 
 Enables a system-wide task dump (excluding kernel threads) to be produced
 when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
-score, and name.  This is helpful to determine why the OOM killer was
-invoked, to identify the rogue task that caused it, and to determine why
-the OOM killer chose the task it did to kill.
+pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents,
+oom_score_adj score, and name.  This is helpful to determine why the OOM
+killer was invoked, to identify the rogue task that caused it, and to
+determine why the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed.  On very
 large systems with thousands of tasks it may not be feasible to dump
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 1571a498a33f..a9b9083c5e49 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 /*
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index 43607bb12cc2..cf4c1cb17dcd 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk,
 		mm->context.asce_limit = STACK_TOP_MAX;
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+		/* pgd_alloc() did not account this pud */
+		mm_inc_nr_puds(mm);
 		break;
 	case -PAGE_SIZE:
 		/* forked 5-level task, set new asce with new_mm->pgd */
@@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk,
 		/* forked 2-level compat task, set new asce with new mm->pgd */
 		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 				   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
-		/* pgd_alloc() did not increase mm->nr_pmds */
+		/* pgd_alloc() did not account this pmd */
 		mm_inc_nr_pmds(mm);
 	}
 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 5078b7f68890..01f63b4ee2b4 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 	pud = pud_offset(pgd, start);
 	pgd_clear(pgd);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6744bd706ecf..a05ce6186f99 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -26,7 +26,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
+	unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -52,6 +52,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	swap = get_mm_counter(mm, MM_SWAPENTS);
 	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
 	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
+	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -68,6 +69,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
 		"VmPMD:\t%8lu kB\n"
+		"VmPUD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		total_vm << (PAGE_SHIFT-10),
@@ -82,6 +84,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		ptes >> 10,
 		pmds >> 10,
+		puds >> 10,
 		swap << (PAGE_SHIFT-10));
 	hugetlb_report_usage(m, mm);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 91b46f99b4d2..9af86f39d928 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
 #endif
 
-#ifdef __PAGETABLE_PUD_FOLDED
+#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
 static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 						unsigned long address)
 {
 	return 0;
 }
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_nr_puds_init(struct mm_struct *mm) {}
+static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
+
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
+
+static inline void mm_nr_puds_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_puds, 0);
+}
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_puds);
+}
+
+static inline void mm_inc_nr_puds(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_puds);
+}
+
+static inline void mm_dec_nr_puds(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_puds);
+}
 #endif
 
 #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
@@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 
 static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
 	return 0;
 }
@@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm)
 	atomic_long_set(&mm->nr_pmds, 0);
 }
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
 	return atomic_long_read(&mm->nr_pmds);
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index d1b8e8f97fc2..e9e561e02d22 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -404,6 +404,9 @@ struct mm_struct {
 	atomic_long_t nr_ptes;			/* PTE page table pages */
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
+#endif
+#if CONFIG_PGTABLE_LEVELS > 3
+	atomic_long_t nr_puds;			/* PUD page table pages */
 #endif
 	int map_count;				/* number of VMAs */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3..a4eb6f289365 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->core_state = NULL;
 	atomic_long_set(&mm->nr_ptes, 0);
 	mm_nr_pmds_init(mm);
+	mm_nr_puds_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
@@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm)
 	if (mm_nr_pmds(mm))
 		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
 				mm_nr_pmds(mm));
+	if (mm_nr_puds(mm))
+		pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n",
+				mm_nr_puds(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
diff --git a/mm/debug.c b/mm/debug.c
index 6726bec731c9..a12d826bb774 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm)
 		"get_unmapped_area %p\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
+		"pgd %p mm_users %d mm_count %d\n"
+		"nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm)
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
 		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
-		mm_nr_pmds((struct mm_struct *)mm),
+		mm_nr_pmds(mm),
+		mm_nr_puds(mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 		mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/memory.c b/mm/memory.c
index 42fb30300bb5..6bbd4078ec98 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
 	pud = pud_offset(p4d, start);
 	p4d_clear(p4d);
 	pud_free_tlb(tlb, pud, start);
+	mm_dec_nr_puds(tlb->mm);
 }
 
 static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -4149,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_5LEVEL_HACK
-	if (p4d_present(*p4d))		/* Another has populated it */
-		pud_free(mm, new);
-	else
+	if (!p4d_present(*p4d)) {
+		mm_inc_nr_puds(mm);
 		p4d_populate(mm, p4d, new);
-#else
-	if (pgd_present(*p4d))		/* Another has populated it */
+	} else	/* Another has populated it */
 		pud_free(mm, new);
-	else
+#else
+	if (!pgd_present(*p4d)) {
+		mm_inc_nr_puds(mm);
 		pgd_populate(mm, p4d, new);
+	} else	/* Another has populated it */
+		pud_free(mm, new);
 #endif /* __ARCH_HAS_5LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3023919970f7..f642a45b7f14 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,7 +221,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
+		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) +
+		mm_nr_puds(p->mm);
 	task_unlock(p);
 
 	/*
@@ -397,7 +398,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -413,11 +414,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			atomic_long_read(&task->mm->nr_ptes),
 			mm_nr_pmds(task->mm),
+			mm_nr_puds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- 
cgit v1.3-14-g43fede


From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:37 -0800
Subject: mm: introduce wrappers to access mm->nr_ptes

Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd
and nr_pud.

The patch also makes nr_ptes accounting dependent onto CONFIG_MMU.  Page
table accounting doesn't make sense if you don't have page tables.

It's preparation for consolidation of page-table counters in mm_struct.

Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mm/pgd.c           |  2 +-
 arch/sparc/mm/hugetlbpage.c |  2 +-
 arch/unicore32/mm/pgd.c     |  2 +-
 fs/proc/task_mmu.c          |  2 +-
 include/linux/mm.h          | 32 ++++++++++++++++++++++++++++++++
 include/linux/mm_types.h    |  2 ++
 kernel/fork.c               |  6 +++---
 mm/debug.c                  |  2 +-
 mm/huge_memory.c            | 10 +++++-----
 mm/khugepaged.c             |  2 +-
 mm/memory.c                 |  8 ++++----
 mm/oom_kill.c               |  5 ++---
 12 files changed, 54 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index c1c1a5c67da1..61e281cb29fb 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base)
 	pte = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free(mm, pte);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 no_pmd:
 	pud_clear(pud);
 	pmd_free(mm, pmd);
diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c
index 01f63b4ee2b4..0112d6942288 100644
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
-	atomic_long_dec(&tlb->mm->nr_ptes);
+	mm_dec_nr_ptes(tlb->mm);
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c
index c572a28c76c9..a830a300aaa1 100644
--- a/arch/unicore32/mm/pgd.c
+++ b/arch/unicore32/mm/pgd.c
@@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd)
 	pte = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free(mm, pte);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 	pmd_free(mm, pmd);
 	mm_dec_nr_pmds(mm);
 free:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a05ce6186f99..9bd2a0294ac1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -50,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
-	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
+	ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm);
 	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
 	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9af86f39d928..2ca799f0d762 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm)
 }
 #endif
 
+#ifdef CONFIG_MMU
+static inline void mm_nr_ptes_init(struct mm_struct *mm)
+{
+	atomic_long_set(&mm->nr_ptes, 0);
+}
+
+static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_ptes);
+}
+
+static inline void mm_inc_nr_ptes(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_ptes);
+}
+
+static inline void mm_dec_nr_ptes(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_ptes);
+}
+#else
+static inline void mm_nr_ptes_init(struct mm_struct *mm) {}
+
+static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
+static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
+#endif
+
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address);
 int __pte_alloc_kernel(pmd_t *pmd, unsigned long address);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e9e561e02d22..e42048020664 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -401,7 +401,9 @@ struct mm_struct {
 	 */
 	atomic_t mm_count;
 
+#ifdef CONFIG_MMU
 	atomic_long_t nr_ptes;			/* PTE page table pages */
+#endif
 #if CONFIG_PGTABLE_LEVELS > 2
 	atomic_long_t nr_pmds;			/* PMD page table pages */
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a4eb6f289365..946922a30ede 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,7 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
-	atomic_long_set(&mm->nr_ptes, 0);
+	mm_nr_ptes_init(mm);
 	mm_nr_pmds_init(mm);
 	mm_nr_puds_init(mm);
 	mm->map_count = 0;
@@ -873,9 +873,9 @@ static void check_mm(struct mm_struct *mm)
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
 
-	if (atomic_long_read(&mm->nr_ptes))
+	if (mm_nr_ptes(mm))
 		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
-				atomic_long_read(&mm->nr_ptes));
+				mm_nr_ptes(mm));
 	if (mm_nr_pmds(mm))
 		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
 				mm_nr_pmds(mm));
diff --git a/mm/debug.c b/mm/debug.c
index a12d826bb774..c9888a6d7875 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
-		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+		mm_nr_ptes(mm),
 		mm_nr_pmds(mm),
 		mm_nr_puds(mm),
 		mm->map_count,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cc65fb87c9db..3610d81c062a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		mm_inc_nr_ptes(vma->vm_mm);
 		spin_unlock(vmf->ptl);
 		count_vm_event(THP_FAULT_ALLOC);
 	}
@@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	if (pgtable)
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	atomic_long_inc(&mm->nr_ptes);
+	mm_inc_nr_ptes(mm);
 	return true;
 }
 
@@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 	if (pgtable) {
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
-		atomic_long_inc(&mm->nr_ptes);
+		mm_inc_nr_ptes(mm);
 	}
 
 	set_pmd_at(mm, addr, pmd, entry);
@@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	get_page(src_page);
 	page_dup_rmap(src_page, true);
 	add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-	atomic_long_inc(&dst_mm->nr_ptes);
+	mm_inc_nr_ptes(dst_mm);
 	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
@@ -1695,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
 
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pte_free(mm, pgtable);
-	atomic_long_dec(&mm->nr_ptes);
+	mm_dec_nr_ptes(mm);
 }
 
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 43cb3043311b..ea4ff259b671 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			spin_unlock(ptl);
 			up_write(&vma->vm_mm->mmap_sem);
-			atomic_long_dec(&vma->vm_mm->nr_ptes);
+			mm_dec_nr_ptes(vma->vm_mm);
 			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
 		}
 	}
diff --git a/mm/memory.c b/mm/memory.c
index 6bbd4078ec98..6dec21b182b0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 	pgtable_t token = pmd_pgtable(*pmd);
 	pmd_clear(pmd);
 	pte_free_tlb(tlb, token, addr);
-	atomic_long_dec(&tlb->mm->nr_ptes);
+	mm_dec_nr_ptes(tlb->mm);
 }
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -666,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 
 	ptl = pmd_lock(mm, pmd);
 	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
-		atomic_long_inc(&mm->nr_ptes);
+		mm_inc_nr_ptes(mm);
 		pmd_populate(mm, pmd, new);
 		new = NULL;
 	}
@@ -3238,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf)
 			goto map_pte;
 		}
 
-		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		mm_inc_nr_ptes(vma->vm_mm);
 		pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
 		spin_unlock(vmf->ptl);
 		vmf->prealloc_pte = NULL;
@@ -3297,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf)
 	 * We are going to consume the prealloc table,
 	 * count that as nr_ptes.
 	 */
-	atomic_long_inc(&vma->vm_mm->nr_ptes);
+	mm_inc_nr_ptes(vma->vm_mm);
 	vmf->prealloc_pte = NULL;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f642a45b7f14..f9300141480e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,8 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) +
-		mm_nr_puds(p->mm);
+		mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm);
 	task_unlock(p);
 
 	/*
@@ -417,7 +416,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			atomic_long_read(&task->mm->nr_ptes),
+			mm_nr_ptes(task->mm),
 			mm_nr_pmds(task->mm),
 			mm_nr_puds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
-- 
cgit v1.3-14-g43fede


From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 15 Nov 2017 17:35:40 -0800
Subject: mm: consolidate page table accounting

Currently, we account page tables separately for each page table level,
but that's redundant -- we only make use of total memory allocated to
page tables for oom_badness calculation.  We also provide the
information to userspace, but it has dubious value there too.

This patch switches page table accounting to single counter.

mm->pgtables_bytes is now used to account all page table levels.  We use
bytes, because page table size for different levels of page table tree
may be different.

The change has user-visible effect: we don't have VmPMD and VmPUD
reported in /proc/[pid]/status.  Not sure if anybody uses them.  (As
alternative, we can always report 0 kB for them.)

OOM-killer report is also slightly changed: we now report pgtables_bytes
instead of nr_ptes, nr_pmd, nr_puds.

Apart from reducing number of counters per-mm, the benefit is that we
now calculate oom_badness() more correctly for machines which have
different size of page tables depending on level or where page tables
are less than a page in size.

The only downside can be debuggability because we do not know which page
table level could leak.  But I do not remember many bugs that would be
caught by separate counters so I wouldn't lose sleep over this.

[akpm@linux-foundation.org: fix mm/huge_memory.c]
Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
[kirill.shutemov@linux.intel.com: fix build]
  Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt |  1 -
 Documentation/sysctl/vm.txt        |  8 +++---
 fs/proc/task_mmu.c                 | 11 ++------
 include/linux/mm.h                 | 58 ++++++++------------------------------
 include/linux/mm_types.h           |  8 +-----
 kernel/fork.c                      | 16 +++--------
 mm/debug.c                         |  7 ++---
 mm/huge_memory.c                   |  2 +-
 mm/oom_kill.c                      | 14 ++++-----
 9 files changed, 32 insertions(+), 93 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index adba21b5ada7..ec571b9bb18a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8)
  VmExe                       size of text segment
  VmLib                       size of shared library code
  VmPTE                       size of page table entries
- VmPMD                       size of second level page tables
  VmSwap                      amount of swap used by anonymous private data
                              (shmem swap usage is not included)
  HugetlbPages                size of hugetlb memory portions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 2729e8db9492..3e579740b49f 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -629,10 +629,10 @@ oom_dump_tasks
 
 Enables a system-wide task dump (excluding kernel threads) to be produced
 when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents,
-oom_score_adj score, and name.  This is helpful to determine why the OOM
-killer was invoked, to identify the rogue task that caused it, and to
-determine why the OOM killer chose the task it did to kill.
+pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj
+score, and name.  This is helpful to determine why the OOM killer was
+invoked, to identify the rogue task that caused it, and to determine why
+the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed.  On very
 large systems with thousands of tasks it may not be feasible to dump
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9bd2a0294ac1..875231c36cb3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -26,7 +26,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem;
+	unsigned long text, lib, swap, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -50,9 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
-	ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm);
-	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
-	puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -68,8 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
-		"VmPMD:\t%8lu kB\n"
-		"VmPUD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		total_vm << (PAGE_SHIFT-10),
@@ -82,9 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		shmem << (PAGE_SHIFT-10),
 		mm->data_vm << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		ptes >> 10,
-		pmds >> 10,
-		puds >> 10,
+		mm_pgtables_bytes(mm) >> 10,
 		swap << (PAGE_SHIFT-10));
 	hugetlb_report_usage(m, mm);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2ca799f0d762..7c1e82a1aa77 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
 {
 	return 0;
 }
-
-static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void mm_nr_puds_init(struct mm_struct *mm) {}
 static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
 static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
 
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
 
-static inline void mm_nr_puds_init(struct mm_struct *mm)
-{
-	atomic_long_set(&mm->nr_puds, 0);
-}
-
-static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
-{
-	return atomic_long_read(&mm->nr_puds);
-}
-
 static inline void mm_inc_nr_puds(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_puds);
+	atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_puds(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_puds);
+	atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes);
 }
 #endif
 
@@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 	return 0;
 }
 
-static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
-
-static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
-{
-	return 0;
-}
-
 static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
 static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
 
 #else
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
 
-static inline void mm_nr_pmds_init(struct mm_struct *mm)
-{
-	atomic_long_set(&mm->nr_pmds, 0);
-}
-
-static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
-{
-	return atomic_long_read(&mm->nr_pmds);
-}
-
 static inline void mm_inc_nr_pmds(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_pmds);
+	atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_pmds(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_pmds);
+	atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes);
 }
 #endif
 
 #ifdef CONFIG_MMU
-static inline void mm_nr_ptes_init(struct mm_struct *mm)
+static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
 {
-	atomic_long_set(&mm->nr_ptes, 0);
+	atomic_long_set(&mm->pgtables_bytes, 0);
 }
 
-static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
 {
-	return atomic_long_read(&mm->nr_ptes);
+	return atomic_long_read(&mm->pgtables_bytes);
 }
 
 static inline void mm_inc_nr_ptes(struct mm_struct *mm)
 {
-	atomic_long_inc(&mm->nr_ptes);
+	atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
 }
 
 static inline void mm_dec_nr_ptes(struct mm_struct *mm)
 {
-	atomic_long_dec(&mm->nr_ptes);
+	atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
 }
 #else
-static inline void mm_nr_ptes_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_ptes(const struct mm_struct *mm)
+static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
+static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
 {
 	return 0;
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e42048020664..09643e0472fc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -402,13 +402,7 @@ struct mm_struct {
 	atomic_t mm_count;
 
 #ifdef CONFIG_MMU
-	atomic_long_t nr_ptes;			/* PTE page table pages */
-#endif
-#if CONFIG_PGTABLE_LEVELS > 2
-	atomic_long_t nr_pmds;			/* PMD page table pages */
-#endif
-#if CONFIG_PGTABLE_LEVELS > 3
-	atomic_long_t nr_puds;			/* PUD page table pages */
+	atomic_long_t pgtables_bytes;		/* PTE page table pages */
 #endif
 	int map_count;				/* number of VMAs */
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 946922a30ede..006dc5899a1a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,9 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
-	mm_nr_ptes_init(mm);
-	mm_nr_pmds_init(mm);
-	mm_nr_puds_init(mm);
+	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
@@ -873,15 +871,9 @@ static void check_mm(struct mm_struct *mm)
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
 
-	if (mm_nr_ptes(mm))
-		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
-				mm_nr_ptes(mm));
-	if (mm_nr_pmds(mm))
-		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
-				mm_nr_pmds(mm));
-	if (mm_nr_puds(mm))
-		pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n",
-				mm_nr_puds(mm));
+	if (mm_pgtables_bytes(mm))
+		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+				mm_pgtables_bytes(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
diff --git a/mm/debug.c b/mm/debug.c
index c9888a6d7875..d947f3e03b0d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -105,8 +105,7 @@ void dump_mm(const struct mm_struct *mm)
 		"get_unmapped_area %p\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d\n"
-		"nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n"
+		"pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -136,9 +135,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
-		mm_nr_ptes(mm),
-		mm_nr_pmds(mm),
-		mm_nr_puds(mm),
+		mm_pgtables_bytes(mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 		mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3610d81c062a..86fe697e8bfb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			set_pmd_at(src_mm, addr, src_pmd, pmd);
 		}
 		add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-		atomic_long_inc(&dst_mm->nr_ptes);
+		mm_inc_nr_ptes(dst_mm);
 		pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 		set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 		ret = 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f9300141480e..26add8a0d1f7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -221,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * task's rss, pagetable and swap space use.
 	 */
 	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-		mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm);
+		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
 	task_unlock(p);
 
 	/*
@@ -389,15 +389,15 @@ static void select_bad_process(struct oom_control *oc)
  * Dumps the current memory state of all eligible tasks.  Tasks not in the same
  * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
  * are not shown.
- * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
- * swapents, oom_score_adj value, and name.
+ * State information includes task's pid, uid, tgid, vm size, rss,
+ * pgtables_bytes, swapents, oom_score_adj value, and name.
  */
 static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 {
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -413,12 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
-			mm_nr_ptes(task->mm),
-			mm_nr_pmds(task->mm),
-			mm_nr_puds(task->mm),
+			mm_pgtables_bytes(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- 
cgit v1.3-14-g43fede


From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:35:51 -0800
Subject: kmemcheck: remove annotations

Patch series "kmemcheck: kill kmemcheck", v2.

As discussed at LSF/MM, kill kmemcheck.

KASan is a replacement that is able to work without the limitation of
kmemcheck (single CPU, slow).  KASan is already upstream.

We are also not aware of any users of kmemcheck (or users who don't
consider KASan as a suitable replacement).

The only objection was that since KASAN wasn't supported by all GCC
versions provided by distros at that time we should hold off for 2
years, and try again.

Now that 2 years have passed, and all distros provide gcc that supports
KASAN, kill kmemcheck again for the very same reasons.

This patch (of 4):

Remove kmemcheck annotations, and calls to kmemcheck from the kernel.

[alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs]
  Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com
Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Hansen <devtimhansen@gmail.com>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/dma-iommu.h        |  1 -
 arch/openrisc/include/asm/dma-mapping.h |  1 -
 arch/x86/Makefile                       |  5 -----
 arch/x86/include/asm/dma-mapping.h      |  1 -
 arch/x86/include/asm/xor.h              |  5 +----
 arch/x86/kernel/traps.c                 |  5 -----
 arch/x86/mm/fault.c                     |  6 ------
 drivers/char/random.c                   |  1 -
 drivers/misc/c2port/core.c              |  2 --
 fs/dcache.c                             |  2 --
 include/linux/c2port.h                  |  4 ----
 include/linux/dma-mapping.h             |  8 +-------
 include/linux/filter.h                  |  2 --
 include/linux/mm_types.h                |  8 --------
 include/linux/net.h                     |  3 ---
 include/linux/ring_buffer.h             |  3 ---
 include/linux/skbuff.h                  |  3 ---
 include/net/inet_sock.h                 |  3 ---
 include/net/inet_timewait_sock.h        |  4 ----
 include/net/sock.h                      |  3 ---
 init/main.c                             |  1 -
 kernel/bpf/core.c                       |  6 ------
 kernel/locking/lockdep.c                |  3 ---
 kernel/trace/ring_buffer.c              |  3 ---
 mm/kmemleak.c                           |  9 ---------
 mm/page_alloc.c                         | 14 --------------
 mm/slab.c                               | 14 --------------
 mm/slab.h                               |  2 --
 mm/slub.c                               | 20 --------------------
 net/core/skbuff.c                       |  5 -----
 net/core/sock.c                         |  2 --
 net/ipv4/inet_timewait_sock.c           |  3 ---
 net/ipv4/tcp_input.c                    |  1 -
 net/socket.c                            |  1 -
 34 files changed, 2 insertions(+), 152 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h
index 0722ec6be692..6821f1249300 100644
--- a/arch/arm/include/asm/dma-iommu.h
+++ b/arch/arm/include/asm/dma-iommu.h
@@ -7,7 +7,6 @@
 #include <linux/mm_types.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
-#include <linux/kmemcheck.h>
 #include <linux/kref.h>
 
 #define ARM_MAPPING_ERROR		(~(dma_addr_t)0x0)
diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h
index f41bd3cb76d9..e212a1f0b6d2 100644
--- a/arch/openrisc/include/asm/dma-mapping.h
+++ b/arch/openrisc/include/asm/dma-mapping.h
@@ -23,7 +23,6 @@
  */
 
 #include <linux/dma-debug.h>
-#include <linux/kmemcheck.h>
 #include <linux/dma-mapping.h>
 
 extern const struct dma_map_ops or1k_dma_map_ops;
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a20eacd9c7e9..3e73bc255e4e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32
 endif
 export CONFIG_X86_X32_ABI
 
-# Don't unroll struct assignments with kmemcheck enabled
-ifeq ($(CONFIG_KMEMCHECK),y)
-	KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
-endif
-
 #
 # If the function graph tracer is used with mcount instead of fentry,
 # '-maccumulate-outgoing-args' is needed to prevent a GCC bug
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 43cbe843de8d..0350d99bb8fd 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -7,7 +7,6 @@
  * Documentation/DMA-API.txt for documentation.
  */
 
-#include <linux/kmemcheck.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-debug.h>
 #include <asm/io.h>
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 1f5c5161ead6..45c8605467f1 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,7 +1,4 @@
-#ifdef CONFIG_KMEMCHECK
-/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
-# include <asm-generic/xor.h>
-#elif !defined(_ASM_X86_XOR_H)
+#ifndef _ASM_X86_XOR_H
 #define _ASM_X86_XOR_H
 
 /*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b7b0f74a2150..989514c94a55 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -42,7 +42,6 @@
 #include <linux/edac.h>
 #endif
 
-#include <asm/kmemcheck.h>
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
@@ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	if (!dr6 && user_mode(regs))
 		user_icebp = 1;
 
-	/* Catch kmemcheck conditions! */
-	if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
-		goto exit;
-
 	/* Store the virtualized DR6 value */
 	tsk->thread.debugreg6 = dr6;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3109ba6c6ede..78ca9a8ee454 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,7 +20,6 @@
 #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/
 #include <asm/traps.h>			/* dotraplinkage, ...		*/
 #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
-#include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
 #include <asm/vm86.h>			/* struct vm86			*/
@@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * Detect and handle instructions that would cause a page fault for
 	 * both a tracked kernel page and a userspace page.
 	 */
-	if (kmemcheck_active(regs))
-		kmemcheck_hide(regs);
 	prefetchw(&mm->mmap_sem);
 
 	if (unlikely(kmmio_fault(regs, address)))
@@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 			if (vmalloc_fault(address) >= 0)
 				return;
-
-			if (kmemcheck_fault(regs, address, error_code))
-				return;
 		}
 
 		/* Can handle a stale RO->RW TLB: */
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 6c7ccac2679e..ec42c8bb9b0d 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -259,7 +259,6 @@
 #include <linux/cryptohash.h>
 #include <linux/fips.h>
 #include <linux/ptrace.h>
-#include <linux/kmemcheck.h>
 #include <linux/workqueue.h>
 #include <linux/irq.h>
 #include <linux/syscalls.h>
diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c
index 1922cb8f6b88..1c5b7aec13d4 100644
--- a/drivers/misc/c2port/core.c
+++ b/drivers/misc/c2port/core.c
@@ -15,7 +15,6 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
 #include <linux/idr.h>
@@ -904,7 +903,6 @@ struct c2port_device *c2port_device_register(char *name,
 		return ERR_PTR(-EINVAL);
 
 	c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL);
-	kmemcheck_annotate_bitfield(c2dev, flags);
 	if (unlikely(!c2dev))
 		return ERR_PTR(-ENOMEM);
 
diff --git a/fs/dcache.c b/fs/dcache.c
index bcc9f6981569..5c7df1df81ff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target)
 			 */
 			unsigned int i;
 			BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
-			kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN);
-			kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN);
 			for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
 				swap(((long *) &dentry->d_iname)[i],
 				     ((long *) &target->d_iname)[i]);
diff --git a/include/linux/c2port.h b/include/linux/c2port.h
index 4efabcb51347..f2736348ca26 100644
--- a/include/linux/c2port.h
+++ b/include/linux/c2port.h
@@ -9,8 +9,6 @@
  * the Free Software Foundation
  */
 
-#include <linux/kmemcheck.h>
-
 #define C2PORT_NAME_LEN			32
 
 struct device;
@@ -22,10 +20,8 @@ struct device;
 /* Main struct */
 struct c2port_ops;
 struct c2port_device {
-	kmemcheck_bitfield_begin(flags);
 	unsigned int access:1;
 	unsigned int flash_access:1;
-	kmemcheck_bitfield_end(flags);
 
 	int id;
 	char name[C2PORT_NAME_LEN];
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index eee1499db396..e8f8e8fb244d 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -9,7 +9,6 @@
 #include <linux/dma-debug.h>
 #include <linux/dma-direction.h>
 #include <linux/scatterlist.h>
-#include <linux/kmemcheck.h>
 #include <linux/bug.h>
 #include <linux/mem_encrypt.h>
 
@@ -232,7 +231,6 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
-	kmemcheck_mark_initialized(ptr, size);
 	BUG_ON(!valid_dma_direction(dir));
 	addr = ops->map_page(dev, virt_to_page(ptr),
 			     offset_in_page(ptr), size,
@@ -265,11 +263,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
 				   unsigned long attrs)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
-	int i, ents;
-	struct scatterlist *s;
+	int ents;
 
-	for_each_sg(sg, s, nents, i)
-		kmemcheck_mark_initialized(sg_virt(s), s->length);
 	BUG_ON(!valid_dma_direction(dir));
 	ents = ops->map_sg(dev, sg, nents, dir, attrs);
 	BUG_ON(ents < 0);
@@ -299,7 +294,6 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev,
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	dma_addr_t addr;
 
-	kmemcheck_mark_initialized(page_address(page) + offset, size);
 	BUG_ON(!valid_dma_direction(dir));
 	addr = ops->map_page(dev, page, offset, size, dir, attrs);
 	debug_dma_map_page(dev, page, offset, size, dir, addr, false);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 48ec57e70f9f..42197b16dd78 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -454,13 +454,11 @@ struct bpf_binary_header {
 
 struct bpf_prog {
 	u16			pages;		/* Number of allocated pages */
-	kmemcheck_bitfield_begin(meta);
 	u16			jited:1,	/* Is our filter JIT'ed? */
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
 				dst_needed:1;	/* Do we need dst entry? */
-	kmemcheck_bitfield_end(meta);
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 09643e0472fc..cfd0ac4e5e0e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -209,14 +209,6 @@ struct page {
 					   not kmapped, ie. highmem) */
 #endif /* WANT_PAGE_VIRTUAL */
 
-#ifdef CONFIG_KMEMCHECK
-	/*
-	 * kmemcheck wants to track the status of each byte in a page; this
-	 * is a pointer to such a status block. NULL if not tracked.
-	 */
-	void *shadow;
-#endif
-
 #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
 	int _last_cpupid;
 #endif
diff --git a/include/linux/net.h b/include/linux/net.h
index d97d80d7fdf8..caeb159abda5 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -22,7 +22,6 @@
 #include <linux/random.h>
 #include <linux/wait.h>
 #include <linux/fcntl.h>	/* For O_CLOEXEC and O_NONBLOCK */
-#include <linux/kmemcheck.h>
 #include <linux/rcupdate.h>
 #include <linux/once.h>
 #include <linux/fs.h>
@@ -111,9 +110,7 @@ struct socket_wq {
 struct socket {
 	socket_state		state;
 
-	kmemcheck_bitfield_begin(type);
 	short			type;
-	kmemcheck_bitfield_end(type);
 
 	unsigned long		flags;
 
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index fa6ace66fea5..289e4d54e3e0 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -2,7 +2,6 @@
 #ifndef _LINUX_RING_BUFFER_H
 #define _LINUX_RING_BUFFER_H
 
-#include <linux/kmemcheck.h>
 #include <linux/mm.h>
 #include <linux/seq_file.h>
 #include <linux/poll.h>
@@ -14,9 +13,7 @@ struct ring_buffer_iter;
  * Don't refer to this struct directly, use functions below.
  */
 struct ring_buffer_event {
-	kmemcheck_bitfield_begin(bitfield);
 	u32		type_len:5, time_delta:27;
-	kmemcheck_bitfield_end(bitfield);
 
 	u32		array[];
 };
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d448a4804aea..aa1341474916 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -15,7 +15,6 @@
 #define _LINUX_SKBUFF_H
 
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/compiler.h>
 #include <linux/time.h>
 #include <linux/bug.h>
@@ -704,7 +703,6 @@ struct sk_buff {
 	/* Following fields are _not_ copied in __copy_skb_header()
 	 * Note that queue_mapping is here mostly to fill a hole.
 	 */
-	kmemcheck_bitfield_begin(flags1);
 	__u16			queue_mapping;
 
 /* if you move cloned around you also must adapt those constants */
@@ -723,7 +721,6 @@ struct sk_buff {
 				head_frag:1,
 				xmit_more:1,
 				__unused:1; /* one bit hole */
-	kmemcheck_bitfield_end(flags1);
 
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index db8162dd8c0b..8e51b4a69088 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -17,7 +17,6 @@
 #define _INET_SOCK_H
 
 #include <linux/bitops.h>
-#include <linux/kmemcheck.h>
 #include <linux/string.h>
 #include <linux/types.h>
 #include <linux/jhash.h>
@@ -84,7 +83,6 @@ struct inet_request_sock {
 #define ireq_state		req.__req_common.skc_state
 #define ireq_family		req.__req_common.skc_family
 
-	kmemcheck_bitfield_begin(flags);
 	u16			snd_wscale : 4,
 				rcv_wscale : 4,
 				tstamp_ok  : 1,
@@ -93,7 +91,6 @@ struct inet_request_sock {
 				ecn_ok	   : 1,
 				acked	   : 1,
 				no_srccheck: 1;
-	kmemcheck_bitfield_end(flags);
 	u32                     ir_mark;
 	union {
 		struct ip_options_rcu __rcu	*ireq_opt;
diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
index 6a75d67a30fd..1356fa6a7566 100644
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -15,8 +15,6 @@
 #ifndef _INET_TIMEWAIT_SOCK_
 #define _INET_TIMEWAIT_SOCK_
 
-
-#include <linux/kmemcheck.h>
 #include <linux/list.h>
 #include <linux/timer.h>
 #include <linux/types.h>
@@ -69,14 +67,12 @@ struct inet_timewait_sock {
 	/* Socket demultiplex comparisons on incoming packets. */
 	/* these three are in inet_sock */
 	__be16			tw_sport;
-	kmemcheck_bitfield_begin(flags);
 	/* And these are ours. */
 	unsigned int		tw_kill		: 1,
 				tw_transparent  : 1,
 				tw_flowlabel	: 20,
 				tw_pad		: 2,	/* 2 bits hole */
 				tw_tos		: 8;
-	kmemcheck_bitfield_end(flags);
 	struct timer_list	tw_timer;
 	struct inet_bind_bucket	*tw_tb;
 };
diff --git a/include/net/sock.h b/include/net/sock.h
index c577286dbffb..a63e6a8bb7e0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -436,7 +436,6 @@ struct sock {
 #define SK_FL_TYPE_MASK    0xffff0000
 #endif
 
-	kmemcheck_bitfield_begin(flags);
 	unsigned int		sk_padding : 1,
 				sk_kern_sock : 1,
 				sk_no_check_tx : 1,
@@ -445,8 +444,6 @@ struct sock {
 				sk_protocol  : 8,
 				sk_type      : 16;
 #define SK_PROTOCOL_MAX U8_MAX
-	kmemcheck_bitfield_end(flags);
-
 	u16			sk_gso_max_segs;
 	unsigned long	        sk_lingertime;
 	struct proto		*sk_prot_creator;
diff --git a/init/main.c b/init/main.c
index 3bdd8da90f69..859a786f7c0a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -70,7 +70,6 @@
 #include <linux/kgdb.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
-#include <linux/kmemcheck.h>
 #include <linux/sfi.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b62df86be1d..11ad089f2c74 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	if (fp == NULL)
 		return NULL;
 
-	kmemcheck_annotate_bitfield(fp, meta);
-
 	aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
 	if (aux == NULL) {
 		vfree(fp);
@@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 	if (fp == NULL) {
 		__bpf_prog_uncharge(fp_old->aux->user, delta);
 	} else {
-		kmemcheck_annotate_bitfield(fp, meta);
-
 		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
 		fp->pages = pages;
 		fp->aux->prog = fp;
@@ -662,8 +658,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
 
 	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
 	if (fp != NULL) {
-		kmemcheck_annotate_bitfield(fp, meta);
-
 		/* aux->prog still points to the fp_other one, so
 		 * when promoting the clone to the real program,
 		 * this still needs to be adapted.
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index db933d063bfc..9776da8db180 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -47,7 +47,6 @@
 #include <linux/stringify.h>
 #include <linux/bitops.h>
 #include <linux/gfp.h>
-#include <linux/kmemcheck.h>
 #include <linux/random.h>
 #include <linux/jhash.h>
 
@@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
 {
 	int i;
 
-	kmemcheck_mark_initialized(lock, sizeof(*lock));
-
 	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
 		lock->class_cache[i] = NULL;
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 845f3805c73d..d57fede84b38 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -13,7 +13,6 @@
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
 #include <linux/kthread.h>	/* for self test */
-#include <linux/kmemcheck.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 
 	event = __rb_page_index(tail_page, tail);
-	kmemcheck_annotate_bitfield(event, bitfield);
 
 	/* account for padding bytes */
 	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
@@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	/* We reserved something on the buffer */
 
 	event = __rb_page_index(tail_page, tail);
-	kmemcheck_annotate_bitfield(event, bitfield);
 	rb_update_event(cpu_buffer, event, info);
 
 	local_inc(&tail_page->entries);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index fca3452e56c1..e4738d5e9b8c 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -110,7 +110,6 @@
 #include <linux/atomic.h>
 
 #include <linux/kasan.h>
-#include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
 #include <linux/memory_hotplug.h>
 
@@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object)
 {
 	u32 old_csum = object->checksum;
 
-	if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
-		return false;
-
 	kasan_disable_current();
 	object->checksum = crc32(0, (void *)object->pointer, object->size);
 	kasan_enable_current();
@@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end,
 		if (scan_should_stop())
 			break;
 
-		/* don't scan uninitialized memory */
-		if (!kmemcheck_is_obj_initialized((unsigned long)ptr,
-						  BYTES_PER_POINTER))
-			continue;
-
 		kasan_disable_current();
 		pointer = *ptr;
 		kasan_enable_current();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e6106d7e9eb0..30a464b47366 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -24,7 +24,6 @@
 #include <linux/memblock.h>
 #include <linux/compiler.h>
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/kasan.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -1013,7 +1012,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
 	trace_mm_page_free(page, order);
-	kmemcheck_free_shadow(page, order);
 
 	/*
 	 * Check tail pages before head page information is cleared to
@@ -2669,15 +2667,6 @@ void split_page(struct page *page, unsigned int order)
 	VM_BUG_ON_PAGE(PageCompound(page), page);
 	VM_BUG_ON_PAGE(!page_count(page), page);
 
-#ifdef CONFIG_KMEMCHECK
-	/*
-	 * Split shadow pages too, because free(page[0]) would
-	 * otherwise free the whole shadow.
-	 */
-	if (kmemcheck_page_is_tracked(page))
-		split_page(virt_to_page(page[0].shadow), order);
-#endif
-
 	for (i = 1; i < (1 << order); i++)
 		set_page_refcounted(page + i);
 	split_page_owner(page, order);
@@ -4223,9 +4212,6 @@ out:
 		page = NULL;
 	}
 
-	if (kmemcheck_enabled && page)
-		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
 	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
 
 	return page;
diff --git a/mm/slab.c b/mm/slab.c
index 7a5e0888a401..c84365e9a591 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -114,7 +114,6 @@
 #include	<linux/rtmutex.h>
 #include	<linux/reciprocal_div.h>
 #include	<linux/debugobjects.h>
-#include	<linux/kmemcheck.h>
 #include	<linux/memory.h>
 #include	<linux/prefetch.h>
 #include	<linux/sched/task_stack.h>
@@ -1433,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 	if (sk_memalloc_socks() && page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
-	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
-		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
-
-		if (cachep->ctor)
-			kmemcheck_mark_uninitialized_pages(page, nr_pages);
-		else
-			kmemcheck_mark_unallocated_pages(page, nr_pages);
-	}
-
 	return page;
 }
 
@@ -1453,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
 	int order = cachep->gfporder;
 	unsigned long nr_freed = (1 << order);
 
-	kmemcheck_free_shadow(page, order);
-
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 		mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
 	else
@@ -3515,8 +3503,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
 	kmemleak_free_recursive(objp, cachep->flags);
 	objp = cache_free_debugcheck(cachep, objp, caller);
 
-	kmemcheck_slab_free(cachep, objp, cachep->object_size);
-
 	/*
 	 * Skip calling cache_free_alien() when the platform is not numa.
 	 * This will avoid cache misses that happen while accessing slabp (which
diff --git a/mm/slab.h b/mm/slab.h
index e19255638cb6..e60a3d1d8f6f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -40,7 +40,6 @@ struct kmem_cache {
 
 #include <linux/memcontrol.h>
 #include <linux/fault-inject.h>
-#include <linux/kmemcheck.h>
 #include <linux/kasan.h>
 #include <linux/kmemleak.h>
 #include <linux/random.h>
@@ -439,7 +438,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
 	for (i = 0; i < size; i++) {
 		void *object = p[i];
 
-		kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
 		kmemleak_alloc_recursive(object, s->object_size, 1,
 					 s->flags, flags);
 		kasan_slab_alloc(s, object, flags);
diff --git a/mm/slub.c b/mm/slub.c
index 51484f0fc068..ac3b50b9abec 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -22,7 +22,6 @@
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
 #include <linux/kasan.h>
-#include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/mempolicy.h>
@@ -1377,7 +1376,6 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x)
 		unsigned long flags;
 
 		local_irq_save(flags);
-		kmemcheck_slab_free(s, x, s->object_size);
 		debug_check_no_locks_freed(x, s->object_size);
 		local_irq_restore(flags);
 	}
@@ -1598,22 +1596,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 		stat(s, ORDER_FALLBACK);
 	}
 
-	if (kmemcheck_enabled &&
-	    !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
-		int pages = 1 << oo_order(oo);
-
-		kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node);
-
-		/*
-		 * Objects from caches that have a constructor don't get
-		 * cleared when they're allocated, so we need to do it here.
-		 */
-		if (s->ctor)
-			kmemcheck_mark_uninitialized_pages(page, pages);
-		else
-			kmemcheck_mark_unallocated_pages(page, pages);
-	}
-
 	page->objects = oo_objects(oo);
 
 	order = compound_order(page);
@@ -1689,8 +1671,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 			check_object(s, page, p, SLUB_RED_INACTIVE);
 	}
 
-	kmemcheck_free_shadow(page, compound_order(page));
-
 	mod_lruvec_page_state(page,
 		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e140ba49b30a..6cd057b41f34 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -41,7 +41,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/in.h>
@@ -234,14 +233,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	shinfo = skb_shinfo(skb);
 	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
-	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
 	if (flags & SKB_ALLOC_FCLONE) {
 		struct sk_buff_fclones *fclones;
 
 		fclones = container_of(skb, struct sk_buff_fclones, skb1);
 
-		kmemcheck_annotate_bitfield(&fclones->skb2, flags1);
 		skb->fclone = SKB_FCLONE_ORIG;
 		refcount_set(&fclones->fclone_ref, 1);
 
@@ -301,7 +298,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
 	shinfo = skb_shinfo(skb);
 	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
 	atomic_set(&shinfo->dataref, 1);
-	kmemcheck_annotate_variable(shinfo->destructor_arg);
 
 	return skb;
 }
@@ -1283,7 +1279,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 		if (!n)
 			return NULL;
 
-		kmemcheck_annotate_bitfield(n, flags1);
 		n->fclone = SKB_FCLONE_UNAVAILABLE;
 	}
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 415f441c63b9..78401fa33ce8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1469,8 +1469,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		sk = kmalloc(prot->obj_size, priority);
 
 	if (sk != NULL) {
-		kmemcheck_annotate_bitfield(sk, flags);
-
 		if (security_sk_alloc(sk, family, priority))
 			goto out_free;
 
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 5b039159e67a..d451b9f19b59 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -9,7 +9,6 @@
  */
 
 #include <linux/kernel.h>
-#include <linux/kmemcheck.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <net/inet_hashtables.h>
@@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
 	if (tw) {
 		const struct inet_sock *inet = inet_sk(sk);
 
-		kmemcheck_annotate_bitfield(tw, flags);
-
 		tw->tw_dr	    = dr;
 		/* Give us an identity. */
 		tw->tw_daddr	    = inet->inet_daddr;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 887585045b27..c04d60a677a7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6195,7 +6195,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
 	if (req) {
 		struct inet_request_sock *ireq = inet_rsk(req);
 
-		kmemcheck_annotate_bitfield(ireq, flags);
 		ireq->ireq_opt = NULL;
 #if IS_ENABLED(CONFIG_IPV6)
 		ireq->pktopts = NULL;
diff --git a/net/socket.c b/net/socket.c
index c729625eb5d3..42d8e9c9ccd5 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -568,7 +568,6 @@ struct socket *sock_alloc(void)
 
 	sock = SOCKET_I(inode);
 
-	kmemcheck_annotate_bitfield(sock, type);
 	inode->i_ino = get_next_ino();
 	inode->i_mode = S_IFSOCK | S_IRWXUGO;
 	inode->i_uid = current_fsuid();
-- 
cgit v1.3-14-g43fede


From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:35:54 -0800
Subject: kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK

Convert all allocations that used a NOTRACK flag to stop using it.

Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Hansen <devtimhansen@gmail.com>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/include/asm/pgalloc.h       |  2 +-
 arch/arm64/include/asm/pgalloc.h     |  2 +-
 arch/powerpc/include/asm/pgalloc.h   |  2 +-
 arch/sh/kernel/dwarf.c               |  4 ++--
 arch/sh/kernel/process.c             |  2 +-
 arch/sparc/mm/init_64.c              |  4 ++--
 arch/unicore32/include/asm/pgalloc.h |  2 +-
 arch/x86/kernel/espfix_64.c          |  2 +-
 arch/x86/mm/init.c                   |  3 +--
 arch/x86/mm/init_64.c                |  2 +-
 arch/x86/mm/pageattr.c               | 10 +++++-----
 arch/x86/mm/pgtable.c                |  2 +-
 arch/x86/platform/efi/efi_64.c       |  2 +-
 crypto/xor.c                         |  7 +------
 include/linux/thread_info.h          |  5 ++---
 init/do_mounts.c                     |  3 +--
 kernel/fork.c                        | 12 ++++++------
 kernel/signal.c                      |  3 +--
 mm/kmemcheck.c                       |  2 +-
 mm/slab.c                            |  2 +-
 mm/slab.h                            |  5 ++---
 mm/slab_common.c                     |  2 +-
 mm/slub.c                            |  4 +---
 23 files changed, 36 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index b2902a5cd780..2d7344f0e208 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 
 static inline void clean_pte_table(pte_t *pte)
 {
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index d25f4f137c2a..5ca6a573a701 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -26,7 +26,7 @@
 
 #define check_pgt_cache()		do { } while (0)
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
 
 #if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index a14203c005f1..e11f03007b57 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp)
 }
 #endif /* MODULE */
 
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
 
 #ifdef CONFIG_PPC_BOOK3S
 #include <asm/book3s/pgalloc.h>
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index e1d751ae2498..1a2526676a87 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void)
 
 	dwarf_frame_cachep = kmem_cache_create("dwarf_frames",
 			sizeof(struct dwarf_frame), 0,
-			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
+			SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
 
 	dwarf_reg_cachep = kmem_cache_create("dwarf_regs",
 			sizeof(struct dwarf_reg), 0,
-			SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL);
+			SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL);
 
 	dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ,
 						    dwarf_frame_cachep);
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index b2d9963d5978..68b1a67533ce 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -59,7 +59,7 @@ void arch_task_cache_init(void)
 
 	task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size,
 					       __alignof__(union thread_xstate),
-					       SLAB_PANIC | SLAB_NOTRACK, NULL);
+					       SLAB_PANIC, NULL);
 }
 
 #ifdef CONFIG_SH_FPU_EMU
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 61bdc1270d19..2de22d703076 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2927,7 +2927,7 @@ void __flush_tlb_all(void)
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 			    unsigned long address)
 {
-	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	pte_t *pte = NULL;
 
 	if (page)
@@ -2939,7 +2939,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
 pgtable_t pte_alloc_one(struct mm_struct *mm,
 			unsigned long address)
 {
-	struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+	struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page)
 		return NULL;
 	if (!pgtable_page_ctor(page)) {
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
index 26775793c204..f0fdb268f8f2 100644
--- a/arch/unicore32/include/asm/pgalloc.h
+++ b/arch/unicore32/include/asm/pgalloc.h
@@ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd);
 #define pgd_alloc(mm)			get_pgd_slow(mm)
 #define pgd_free(mm, pgd)		free_pgd_slow(mm, pgd)
 
-#define PGALLOC_GFP	(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP	(GFP_KERNEL | __GFP_ZERO)
 
 /*
  * Allocate one PTE table.
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 7d7715dde901..e5ec3cafa72e 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,7 +57,7 @@
 # error "Need more virtual address space for the ESPFIX hack"
 #endif
 
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
 
 /* This contains the *bottom* address of the espfix stack */
 DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..ef94620ceb8a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num)
 		unsigned int order;
 
 		order = get_order((unsigned long)num << PAGE_SHIFT);
-		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
-						__GFP_ZERO, order);
+		return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
 	}
 
 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index adcea90a2046..5fa3a58b5d78 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
 	void *ptr;
 
 	if (after_bootmem)
-		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
 	else
 		ptr = alloc_bootmem_pages(PAGE_SIZE);
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3fe68483463c..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 
 	if (!debug_pagealloc_enabled())
 		spin_unlock(&cpa_lock);
-	base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+	base = alloc_pages(GFP_KERNEL, 0);
 	if (!debug_pagealloc_enabled())
 		spin_lock(&cpa_lock);
 	if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
 
 static int alloc_pte_page(pmd_t *pmd)
 {
-	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
 	if (!pte)
 		return -1;
 
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
 
 static int alloc_pmd_page(pud_t *pud)
 {
-	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+	pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!pmd)
 		return -1;
 
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	pgd_entry = cpa->pgd + pgd_index(addr);
 
 	if (pgd_none(*pgd_entry)) {
-		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
 		if (!p4d)
 			return -1;
 
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
 	 */
 	p4d = p4d_offset(pgd_entry, addr);
 	if (p4d_none(*p4d)) {
-		pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
 		if (!pud)
 			return -1;
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..96d456a94b03 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -7,7 +7,7 @@
 #include <asm/fixmap.h>
 #include <asm/mtrr.h>
 
-#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9e4ee5b04b2d..6a151ce70e86 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void)
 	if (efi_enabled(EFI_OLD_MEMMAP))
 		return 0;
 
-	gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
+	gfp_mask = GFP_KERNEL | __GFP_ZERO;
 	efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
 	if (!efi_pgd)
 		return -ENOMEM;
diff --git a/crypto/xor.c b/crypto/xor.c
index 263af9fb45ea..bce9fe7af40a 100644
--- a/crypto/xor.c
+++ b/crypto/xor.c
@@ -122,12 +122,7 @@ calibrate_xor_blocks(void)
 		goto out;
 	}
 
-	/*
-	 * Note: Since the memory is not actually used for _anything_ but to
-	 * test the XOR speed, we don't really want kmemcheck to warn about
-	 * reading uninitialized bytes here.
-	 */
-	b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2);
+	b1 = (void *) __get_free_pages(GFP_KERNEL, 2);
 	if (!b1) {
 		printk(KERN_WARNING "xor: Yikes!  No memory available.\n");
 		return -ENOMEM;
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 4bcdf00c110f..34f053a150a9 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -44,10 +44,9 @@ enum {
 #endif
 
 #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK)
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
-				 __GFP_ZERO)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_ZERO)
 #else
-# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT)
 #endif
 
 /*
diff --git a/init/do_mounts.c b/init/do_mounts.c
index f6d4dd764a52..7cf4f6dafd5f 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -380,8 +380,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data)
 
 void __init mount_block_root(char *name, int flags)
 {
-	struct page *page = alloc_page(GFP_KERNEL |
-					__GFP_NOTRACK_FALSE_POSITIVE);
+	struct page *page = alloc_page(GFP_KERNEL);
 	char *fs_names = page_address(page);
 	char *p;
 #ifdef CONFIG_BLOCK
diff --git a/kernel/fork.c b/kernel/fork.c
index 006dc5899a1a..4e55eedba8d6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -469,7 +469,7 @@ void __init fork_init(void)
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep = kmem_cache_create("task_struct",
 			arch_task_struct_size, align,
-			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
+			SLAB_PANIC|SLAB_ACCOUNT, NULL);
 #endif
 
 	/* do the arch specific task caches init */
@@ -2205,18 +2205,18 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
-			SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
+			SLAB_ACCOUNT, sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	/*
 	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
@@ -2227,7 +2227,7 @@ void __init proc_caches_init(void)
 	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
 	mmap_init();
diff --git a/kernel/signal.c b/kernel/signal.c
index 8dcd8825b2de..aa1fb9f905db 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	else
 		override_rlimit = 0;
 
-	q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
-		override_rlimit);
+	q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
 	if (q) {
 		list_add_tail(&q->list, &pending->list);
 		switch ((unsigned long) info) {
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 800d64b854ea..b3a4d61d341c 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -18,7 +18,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
 	 * With kmemcheck enabled, we need to allocate a memory area for the
 	 * shadow bits as well.
 	 */
-	shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order);
+	shadow = alloc_pages_node(node, flags, order);
 	if (!shadow) {
 		if (printk_ratelimit())
 			pr_err("kmemcheck: failed to allocate shadow bitmap\n");
diff --git a/mm/slab.c b/mm/slab.c
index c84365e9a591..183e996dde5f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1410,7 +1410,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
 
 	flags |= cachep->allocflags;
 
-	page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
+	page = __alloc_pages_node(nodeid, flags, cachep->gfporder);
 	if (!page) {
 		slab_out_of_memory(cachep, flags, nodeid);
 		return NULL;
diff --git a/mm/slab.h b/mm/slab.h
index e60a3d1d8f6f..ad657ffa44e5 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -141,10 +141,10 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
 #if defined(CONFIG_SLAB)
 #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
 			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
-			  SLAB_NOTRACK | SLAB_ACCOUNT)
+			  SLAB_ACCOUNT)
 #elif defined(CONFIG_SLUB)
 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
-			  SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
+			  SLAB_TEMPORARY | SLAB_ACCOUNT)
 #else
 #define SLAB_CACHE_FLAGS (0)
 #endif
@@ -163,7 +163,6 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
 			      SLAB_NOLEAKTRACE | \
 			      SLAB_RECLAIM_ACCOUNT | \
 			      SLAB_TEMPORARY | \
-			      SLAB_NOTRACK | \
 			      SLAB_ACCOUNT)
 
 int __kmem_cache_shutdown(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 175e86637afd..c8cb36774ba1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
 		SLAB_FAILSLAB | SLAB_KASAN)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
-			 SLAB_NOTRACK | SLAB_ACCOUNT)
+			 SLAB_ACCOUNT)
 
 /*
  * Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index ac3b50b9abec..91aa99b4b836 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1436,8 +1436,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
 	struct page *page;
 	int order = oo_order(oo);
 
-	flags |= __GFP_NOTRACK;
-
 	if (node == NUMA_NO_NODE)
 		page = alloc_pages(flags, order);
 	else
@@ -3774,7 +3772,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	struct page *page;
 	void *ptr = NULL;
 
-	flags |= __GFP_COMP | __GFP_NOTRACK;
+	flags |= __GFP_COMP;
 	page = alloc_pages_node(node, flags, get_order(size));
 	if (page)
 		ptr = page_address(page);
-- 
cgit v1.3-14-g43fede


From 4675ff05de2d76d167336b368bd07f3fef6ed5a6 Mon Sep 17 00:00:00 2001
From: "Levin, Alexander (Sasha Levin)" <alexander.levin@verizon.com>
Date: Wed, 15 Nov 2017 17:36:02 -0800
Subject: kmemcheck: rip it out

Fix up makefiles, remove references, and git rm kmemcheck.

Link: http://lkml.kernel.org/r/20171007030159.22241-4-alexander.levin@verizon.com
Signed-off-by: Sasha Levin <alexander.levin@verizon.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Vegard Nossum <vegardno@ifi.uio.no>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Tim Hansen <devtimhansen@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/admin-guide/kernel-parameters.txt |   7 -
 Documentation/dev-tools/index.rst               |   1 -
 Documentation/dev-tools/kmemcheck.rst           | 733 ------------------------
 MAINTAINERS                                     |  10 -
 arch/x86/Kconfig                                |   3 +-
 arch/x86/include/asm/kmemcheck.h                |  42 --
 arch/x86/include/asm/string_32.h                |   9 -
 arch/x86/include/asm/string_64.h                |   8 -
 arch/x86/kernel/cpu/intel.c                     |  15 -
 arch/x86/mm/Makefile                            |   2 -
 arch/x86/mm/init.c                              |   5 +-
 arch/x86/mm/kmemcheck/Makefile                  |   1 -
 arch/x86/mm/kmemcheck/error.c                   | 227 --------
 arch/x86/mm/kmemcheck/error.h                   |  15 -
 arch/x86/mm/kmemcheck/kmemcheck.c               | 658 ---------------------
 arch/x86/mm/kmemcheck/opcode.c                  | 106 ----
 arch/x86/mm/kmemcheck/opcode.h                  |   9 -
 arch/x86/mm/kmemcheck/pte.c                     |  22 -
 arch/x86/mm/kmemcheck/pte.h                     |  10 -
 arch/x86/mm/kmemcheck/selftest.c                |  70 ---
 arch/x86/mm/kmemcheck/selftest.h                |   6 -
 arch/x86/mm/kmemcheck/shadow.c                  | 173 ------
 arch/x86/mm/kmemcheck/shadow.h                  |  18 -
 include/linux/interrupt.h                       |  15 -
 include/linux/kmemcheck.h                       | 171 ------
 kernel/softirq.c                                |  10 -
 kernel/sysctl.c                                 |  10 -
 lib/Kconfig.debug                               |   6 +-
 lib/Kconfig.kmemcheck                           |  94 ---
 mm/Kconfig.debug                                |   1 -
 mm/Makefile                                     |   2 -
 mm/kmemcheck.c                                  | 125 ----
 mm/slub.c                                       |   5 +-
 scripts/kernel-doc                              |   2 -
 tools/include/linux/kmemcheck.h                 |   8 -
 35 files changed, 7 insertions(+), 2592 deletions(-)
 delete mode 100644 Documentation/dev-tools/kmemcheck.rst
 delete mode 100644 arch/x86/mm/kmemcheck/Makefile
 delete mode 100644 arch/x86/mm/kmemcheck/kmemcheck.c
 delete mode 100644 arch/x86/mm/kmemcheck/shadow.c
 delete mode 100644 lib/Kconfig.kmemcheck

(limited to 'kernel')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b74e13312fdc..00bb04972612 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1864,13 +1864,6 @@
 			Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y,
 			the default is off.
 
-	kmemcheck=	[X86] Boot-time kmemcheck enable/disable/one-shot mode
-			Valid arguments: 0, 1, 2
-			kmemcheck=0 (disabled)
-			kmemcheck=1 (enabled)
-			kmemcheck=2 (one-shot mode)
-			Default: 2 (one-shot mode)
-
 	kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
 			Default is 0 (don't ignore, but inject #GP)
 
diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst
index a81787cd47d7..e313925fb0fa 100644
--- a/Documentation/dev-tools/index.rst
+++ b/Documentation/dev-tools/index.rst
@@ -21,7 +21,6 @@ whole; patches welcome!
    kasan
    ubsan
    kmemleak
-   kmemcheck
    gdb-kernel-debugging
    kgdb
    kselftest
diff --git a/Documentation/dev-tools/kmemcheck.rst b/Documentation/dev-tools/kmemcheck.rst
deleted file mode 100644
index 7f3d1985de74..000000000000
--- a/Documentation/dev-tools/kmemcheck.rst
+++ /dev/null
@@ -1,733 +0,0 @@
-Getting started with kmemcheck
-==============================
-
-Vegard Nossum <vegardno@ifi.uio.no>
-
-
-Introduction
-------------
-
-kmemcheck is a debugging feature for the Linux Kernel. More specifically, it
-is a dynamic checker that detects and warns about some uses of uninitialized
-memory.
-
-Userspace programmers might be familiar with Valgrind's memcheck. The main
-difference between memcheck and kmemcheck is that memcheck works for userspace
-programs only, and kmemcheck works for the kernel only. The implementations
-are of course vastly different. Because of this, kmemcheck is not as accurate
-as memcheck, but it turns out to be good enough in practice to discover real
-programmer errors that the compiler is not able to find through static
-analysis.
-
-Enabling kmemcheck on a kernel will probably slow it down to the extent that
-the machine will not be usable for normal workloads such as e.g. an
-interactive desktop. kmemcheck will also cause the kernel to use about twice
-as much memory as normal. For this reason, kmemcheck is strictly a debugging
-feature.
-
-
-Downloading
------------
-
-As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel.
-
-
-Configuring and compiling
--------------------------
-
-kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of
-configuration variables must have specific settings in order for the kmemcheck
-menu to even appear in "menuconfig". These are:
-
-- ``CONFIG_CC_OPTIMIZE_FOR_SIZE=n``
-	This option is located under "General setup" / "Optimize for size".
-
-	Without this, gcc will use certain optimizations that usually lead to
-	false positive warnings from kmemcheck. An example of this is a 16-bit
-	field in a struct, where gcc may load 32 bits, then discard the upper
-	16 bits. kmemcheck sees only the 32-bit load, and may trigger a
-	warning for the upper 16 bits (if they're uninitialized).
-
-- ``CONFIG_SLAB=y`` or ``CONFIG_SLUB=y``
-	This option is located under "General setup" / "Choose SLAB
-	allocator".
-
-- ``CONFIG_FUNCTION_TRACER=n``
-	This option is located under "Kernel hacking" / "Tracers" / "Kernel
-	Function Tracer"
-
-	When function tracing is compiled in, gcc emits a call to another
-	function at the beginning of every function. This means that when the
-	page fault handler is called, the ftrace framework will be called
-	before kmemcheck has had a chance to handle the fault. If ftrace then
-	modifies memory that was tracked by kmemcheck, the result is an
-	endless recursive page fault.
-
-- ``CONFIG_DEBUG_PAGEALLOC=n``
-	This option is located under "Kernel hacking" / "Memory Debugging"
-	/ "Debug page memory allocations".
-
-In addition, I highly recommend turning on ``CONFIG_DEBUG_INFO=y``. This is also
-located under "Kernel hacking". With this, you will be able to get line number
-information from the kmemcheck warnings, which is extremely valuable in
-debugging a problem. This option is not mandatory, however, because it slows
-down the compilation process and produces a much bigger kernel image.
-
-Now the kmemcheck menu should be visible (under "Kernel hacking" / "Memory
-Debugging" / "kmemcheck: trap use of uninitialized memory"). Here follows
-a description of the kmemcheck configuration variables:
-
-- ``CONFIG_KMEMCHECK``
-	This must be enabled in order to use kmemcheck at all...
-
-- ``CONFIG_KMEMCHECK_``[``DISABLED`` | ``ENABLED`` | ``ONESHOT``]``_BY_DEFAULT``
-	This option controls the status of kmemcheck at boot-time. "Enabled"
-	will enable kmemcheck right from the start, "disabled" will boot the
-	kernel as normal (but with the kmemcheck code compiled in, so it can
-	be enabled at run-time after the kernel has booted), and "one-shot" is
-	a special mode which will turn kmemcheck off automatically after
-	detecting the first use of uninitialized memory.
-
-	If you are using kmemcheck to actively debug a problem, then you
-	probably want to choose "enabled" here.
-
-	The one-shot mode is mostly useful in automated test setups because it
-	can prevent floods of warnings and increase the chances of the machine
-	surviving in case something is really wrong. In other cases, the one-
-	shot mode could actually be counter-productive because it would turn
-	itself off at the very first error -- in the case of a false positive
-	too -- and this would come in the way of debugging the specific
-	problem you were interested in.
-
-	If you would like to use your kernel as normal, but with a chance to
-	enable kmemcheck in case of some problem, it might be a good idea to
-	choose "disabled" here. When kmemcheck is disabled, most of the run-
-	time overhead is not incurred, and the kernel will be almost as fast
-	as normal.
-
-- ``CONFIG_KMEMCHECK_QUEUE_SIZE``
-	Select the maximum number of error reports to store in an internal
-	(fixed-size) buffer. Since errors can occur virtually anywhere and in
-	any context, we need a temporary storage area which is guaranteed not
-	to generate any other page faults when accessed. The queue will be
-	emptied as soon as a tasklet may be scheduled. If the queue is full,
-	new error reports will be lost.
-
-	The default value of 64 is probably fine. If some code produces more
-	than 64 errors within an irqs-off section, then the code is likely to
-	produce many, many more, too, and these additional reports seldom give
-	any more information (the first report is usually the most valuable
-	anyway).
-
-	This number might have to be adjusted if you are not using serial
-	console or similar to capture the kernel log. If you are using the
-	"dmesg" command to save the log, then getting a lot of kmemcheck
-	warnings might overflow the kernel log itself, and the earlier reports
-	will get lost in that way instead. Try setting this to 10 or so on
-	such a setup.
-
-- ``CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT``
-	Select the number of shadow bytes to save along with each entry of the
-	error-report queue. These bytes indicate what parts of an allocation
-	are initialized, uninitialized, etc. and will be displayed when an
-	error is detected to help the debugging of a particular problem.
-
-	The number entered here is actually the logarithm of the number of
-	bytes that will be saved. So if you pick for example 5 here, kmemcheck
-	will save 2^5 = 32 bytes.
-
-	The default value should be fine for debugging most problems. It also
-	fits nicely within 80 columns.
-
-- ``CONFIG_KMEMCHECK_PARTIAL_OK``
-	This option (when enabled) works around certain GCC optimizations that
-	produce 32-bit reads from 16-bit variables where the upper 16 bits are
-	thrown away afterwards.
-
-	The default value (enabled) is recommended. This may of course hide
-	some real errors, but disabling it would probably produce a lot of
-	false positives.
-
-- ``CONFIG_KMEMCHECK_BITOPS_OK``
-	This option silences warnings that would be generated for bit-field
-	accesses where not all the bits are initialized at the same time. This
-	may also hide some real bugs.
-
-	This option is probably obsolete, or it should be replaced with
-	the kmemcheck-/bitfield-annotations for the code in question. The
-	default value is therefore fine.
-
-Now compile the kernel as usual.
-
-
-How to use
-----------
-
-Booting
-~~~~~~~
-
-First some information about the command-line options. There is only one
-option specific to kmemcheck, and this is called "kmemcheck". It can be used
-to override the default mode as chosen by the ``CONFIG_KMEMCHECK_*_BY_DEFAULT``
-option. Its possible settings are:
-
-- ``kmemcheck=0`` (disabled)
-- ``kmemcheck=1`` (enabled)
-- ``kmemcheck=2`` (one-shot mode)
-
-If SLUB debugging has been enabled in the kernel, it may take precedence over
-kmemcheck in such a way that the slab caches which are under SLUB debugging
-will not be tracked by kmemcheck. In order to ensure that this doesn't happen
-(even though it shouldn't by default), use SLUB's boot option ``slub_debug``,
-like this: ``slub_debug=-``
-
-In fact, this option may also be used for fine-grained control over SLUB vs.
-kmemcheck. For example, if the command line includes
-``kmemcheck=1 slub_debug=,dentry``, then SLUB debugging will be used only
-for the "dentry" slab cache, and with kmemcheck tracking all the other
-caches. This is advanced usage, however, and is not generally recommended.
-
-
-Run-time enable/disable
-~~~~~~~~~~~~~~~~~~~~~~~
-
-When the kernel has booted, it is possible to enable or disable kmemcheck at
-run-time. WARNING: This feature is still experimental and may cause false
-positive warnings to appear. Therefore, try not to use this. If you find that
-it doesn't work properly (e.g. you see an unreasonable amount of warnings), I
-will be happy to take bug reports.
-
-Use the file ``/proc/sys/kernel/kmemcheck`` for this purpose, e.g.::
-
-	$ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck
-
-The numbers are the same as for the ``kmemcheck=`` command-line option.
-
-
-Debugging
-~~~~~~~~~
-
-A typical report will look something like this::
-
-    WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
-    80000000000000000000000000000000000000000088ffff0000000000000000
-     i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
-             ^
-
-    Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A
-    RIP: 0010:[<ffffffff8104ede8>]  [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
-    RSP: 0018:ffff88003cdf7d98  EFLAGS: 00210002
-    RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
-    RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84
-    RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000
-    R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e
-    R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8
-    FS:  0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000
-    CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
-    CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0
-    DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
-    DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400
-     [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
-     [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
-     [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
-     [<ffffffff8100c7b5>] int_signal+0x12/0x17
-     [<ffffffffffffffff>] 0xffffffffffffffff
-
-The single most valuable information in this report is the RIP (or EIP on 32-
-bit) value. This will help us pinpoint exactly which instruction that caused
-the warning.
-
-If your kernel was compiled with ``CONFIG_DEBUG_INFO=y``, then all we have to do
-is give this address to the addr2line program, like this::
-
-	$ addr2line -e vmlinux -i ffffffff8104ede8
-	arch/x86/include/asm/string_64.h:12
-	include/asm-generic/siginfo.h:287
-	kernel/signal.c:380
-	kernel/signal.c:410
-
-The "``-e vmlinux``" tells addr2line which file to look in. **IMPORTANT:**
-This must be the vmlinux of the kernel that produced the warning in the
-first place! If not, the line number information will almost certainly be
-wrong.
-
-The "``-i``" tells addr2line to also print the line numbers of inlined
-functions.  In this case, the flag was very important, because otherwise,
-it would only have printed the first line, which is just a call to
-``memcpy()``, which could be called from a thousand places in the kernel, and
-is therefore not very useful.  These inlined functions would not show up in
-the stack trace above, simply because the kernel doesn't load the extra
-debugging information. This technique can of course be used with ordinary
-kernel oopses as well.
-
-In this case, it's the caller of ``memcpy()`` that is interesting, and it can be
-found in ``include/asm-generic/siginfo.h``, line 287::
-
-    281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from)
-    282 {
-    283         if (from->si_code < 0)
-    284                 memcpy(to, from, sizeof(*to));
-    285         else
-    286                 /* _sigchld is currently the largest know union member */
-    287                 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld));
-    288 }
-
-Since this was a read (kmemcheck usually warns about reads only, though it can
-warn about writes to unallocated or freed memory as well), it was probably the
-"from" argument which contained some uninitialized bytes. Following the chain
-of calls, we move upwards to see where "from" was allocated or initialized,
-``kernel/signal.c``, line 380::
-
-    359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
-    360 {
-    ...
-    367         list_for_each_entry(q, &list->list, list) {
-    368                 if (q->info.si_signo == sig) {
-    369                         if (first)
-    370                                 goto still_pending;
-    371                         first = q;
-    ...
-    377         if (first) {
-    378 still_pending:
-    379                 list_del_init(&first->list);
-    380                 copy_siginfo(info, &first->info);
-    381                 __sigqueue_free(first);
-    ...
-    392         }
-    393 }
-
-Here, it is ``&first->info`` that is being passed on to ``copy_siginfo()``. The
-variable ``first`` was found on a list -- passed in as the second argument to
-``collect_signal()``. We  continue our journey through the stack, to figure out
-where the item on "list" was allocated or initialized. We move to line 410::
-
-    395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
-    396                         siginfo_t *info)
-    397 {
-    ...
-    410                 collect_signal(sig, pending, info);
-    ...
-    414 }
-
-Now we need to follow the ``pending`` pointer, since that is being passed on to
-``collect_signal()`` as ``list``. At this point, we've run out of lines from the
-"addr2line" output. Not to worry, we just paste the next addresses from the
-kmemcheck stack dump, i.e.::
-
-     [<ffffffff8104f04e>] dequeue_signal+0x8e/0x170
-     [<ffffffff81050bd8>] get_signal_to_deliver+0x98/0x390
-     [<ffffffff8100b87d>] do_notify_resume+0xad/0x7d0
-     [<ffffffff8100c7b5>] int_signal+0x12/0x17
-
-	$ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \
-		ffffffff8100b87d ffffffff8100c7b5
-	kernel/signal.c:446
-	kernel/signal.c:1806
-	arch/x86/kernel/signal.c:805
-	arch/x86/kernel/signal.c:871
-	arch/x86/kernel/entry_64.S:694
-
-Remember that since these addresses were found on the stack and not as the
-RIP value, they actually point to the _next_ instruction (they are return
-addresses). This becomes obvious when we look at the code for line 446::
-
-    422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
-    423 {
-    ...
-    431                 signr = __dequeue_signal(&tsk->signal->shared_pending,
-    432						 mask, info);
-    433			/*
-    434			 * itimer signal ?
-    435			 *
-    436			 * itimers are process shared and we restart periodic
-    437			 * itimers in the signal delivery path to prevent DoS
-    438			 * attacks in the high resolution timer case. This is
-    439			 * compliant with the old way of self restarting
-    440			 * itimers, as the SIGALRM is a legacy signal and only
-    441			 * queued once. Changing the restart behaviour to
-    442			 * restart the timer in the signal dequeue path is
-    443			 * reducing the timer noise on heavy loaded !highres
-    444			 * systems too.
-    445			 */
-    446			if (unlikely(signr == SIGALRM)) {
-    ...
-    489 }
-
-So instead of looking at 446, we should be looking at 431, which is the line
-that executes just before 446. Here we see that what we are looking for is
-``&tsk->signal->shared_pending``.
-
-Our next task is now to figure out which function that puts items on this
-``shared_pending`` list. A crude, but efficient tool, is ``git grep``::
-
-	$ git grep -n 'shared_pending' kernel/
-	...
-	kernel/signal.c:828:	pending = group ? &t->signal->shared_pending : &t->pending;
-	kernel/signal.c:1339:	pending = group ? &t->signal->shared_pending : &t->pending;
-	...
-
-There were more results, but none of them were related to list operations,
-and these were the only assignments. We inspect the line numbers more closely
-and find that this is indeed where items are being added to the list::
-
-    816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
-    817				int group)
-    818 {
-    ...
-    828		pending = group ? &t->signal->shared_pending : &t->pending;
-    ...
-    851		q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
-    852						     (is_si_special(info) ||
-    853						      info->si_code >= 0)));
-    854		if (q) {
-    855			list_add_tail(&q->list, &pending->list);
-    ...
-    890 }
-
-and::
-
-    1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
-    1310 {
-    ....
-    1339	 pending = group ? &t->signal->shared_pending : &t->pending;
-    1340	 list_add_tail(&q->list, &pending->list);
-    ....
-    1347 }
-
-In the first case, the list element we are looking for, ``q``, is being
-returned from the function ``__sigqueue_alloc()``, which looks like an
-allocation function.  Let's take a look at it::
-
-    187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
-    188						 int override_rlimit)
-    189 {
-    190		struct sigqueue *q = NULL;
-    191		struct user_struct *user;
-    192
-    193		/*
-    194		 * We won't get problems with the target's UID changing under us
-    195		 * because changing it requires RCU be used, and if t != current, the
-    196		 * caller must be holding the RCU readlock (by way of a spinlock) and
-    197		 * we use RCU protection here
-    198		 */
-    199		user = get_uid(__task_cred(t)->user);
-    200		atomic_inc(&user->sigpending);
-    201		if (override_rlimit ||
-    202		    atomic_read(&user->sigpending) <=
-    203				t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur)
-    204			q = kmem_cache_alloc(sigqueue_cachep, flags);
-    205		if (unlikely(q == NULL)) {
-    206			atomic_dec(&user->sigpending);
-    207			free_uid(user);
-    208		} else {
-    209			INIT_LIST_HEAD(&q->list);
-    210			q->flags = 0;
-    211			q->user = user;
-    212		}
-    213
-    214		return q;
-    215 }
-
-We see that this function initializes ``q->list``, ``q->flags``, and
-``q->user``. It seems that now is the time to look at the definition of
-``struct sigqueue``, e.g.::
-
-    14 struct sigqueue {
-    15	       struct list_head list;
-    16	       int flags;
-    17	       siginfo_t info;
-    18	       struct user_struct *user;
-    19 };
-
-And, you might remember, it was a ``memcpy()`` on ``&first->info`` that
-caused the warning, so this makes perfect sense. It also seems reasonable
-to assume that it is the caller of ``__sigqueue_alloc()`` that has the
-responsibility of filling out (initializing) this member.
-
-But just which fields of the struct were uninitialized? Let's look at
-kmemcheck's report again::
-
-    WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024)
-    80000000000000000000000000000000000000000088ffff0000000000000000
-     i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
-	     ^
-
-These first two lines are the memory dump of the memory object itself, and
-the shadow bytemap, respectively. The memory object itself is in this case
-``&first->info``. Just beware that the start of this dump is NOT the start
-of the object itself! The position of the caret (^) corresponds with the
-address of the read (ffff88003e4a2024).
-
-The shadow bytemap dump legend is as follows:
-
-- i: initialized
-- u: uninitialized
-- a: unallocated (memory has been allocated by the slab layer, but has not
-  yet been handed off to anybody)
-- f: freed (memory has been allocated by the slab layer, but has been freed
-  by the previous owner)
-
-In order to figure out where (relative to the start of the object) the
-uninitialized memory was located, we have to look at the disassembly. For
-that, we'll need the RIP address again::
-
-    RIP: 0010:[<ffffffff8104ede8>]  [<ffffffff8104ede8>] __dequeue_signal+0xc8/0x190
-
-	$ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8:
-	ffffffff8104edc8:	mov    %r8,0x8(%r8)
-	ffffffff8104edcc:	test   %r10d,%r10d
-	ffffffff8104edcf:	js     ffffffff8104ee88 <__dequeue_signal+0x168>
-	ffffffff8104edd5:	mov    %rax,%rdx
-	ffffffff8104edd8:	mov    $0xc,%ecx
-	ffffffff8104eddd:	mov    %r13,%rdi
-	ffffffff8104ede0:	mov    $0x30,%eax
-	ffffffff8104ede5:	mov    %rdx,%rsi
-	ffffffff8104ede8:	rep movsl %ds:(%rsi),%es:(%rdi)
-	ffffffff8104edea:	test   $0x2,%al
-	ffffffff8104edec:	je     ffffffff8104edf0 <__dequeue_signal+0xd0>
-	ffffffff8104edee:	movsw  %ds:(%rsi),%es:(%rdi)
-	ffffffff8104edf0:	test   $0x1,%al
-	ffffffff8104edf2:	je     ffffffff8104edf5 <__dequeue_signal+0xd5>
-	ffffffff8104edf4:	movsb  %ds:(%rsi),%es:(%rdi)
-	ffffffff8104edf5:	mov    %r8,%rdi
-	ffffffff8104edf8:	callq  ffffffff8104de60 <__sigqueue_free>
-
-As expected, it's the "``rep movsl``" instruction from the ``memcpy()``
-that causes the warning. We know about ``REP MOVSL`` that it uses the register
-``RCX`` to count the number of remaining iterations. By taking a look at the
-register dump again (from the kmemcheck report), we can figure out how many
-bytes were left to copy::
-
-    RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009
-
-By looking at the disassembly, we also see that ``%ecx`` is being loaded
-with the value ``$0xc`` just before (ffffffff8104edd8), so we are very
-lucky. Keep in mind that this is the number of iterations, not bytes. And
-since this is a "long" operation, we need to multiply by 4 to get the
-number of bytes. So this means that the uninitialized value was encountered
-at 4 * (0xc - 0x9) = 12 bytes from the start of the object.
-
-We can now try to figure out which field of the "``struct siginfo``" that
-was not initialized. This is the beginning of the struct::
-
-    40 typedef struct siginfo {
-    41	       int si_signo;
-    42	       int si_errno;
-    43	       int si_code;
-    44
-    45	       union {
-    ..
-    92	       } _sifields;
-    93 } siginfo_t;
-
-On 64-bit, the int is 4 bytes long, so it must the union member that has
-not been initialized. We can verify this using gdb::
-
-	$ gdb vmlinux
-	...
-	(gdb) p &((struct siginfo *) 0)->_sifields
-	$1 = (union {...} *) 0x10
-
-Actually, it seems that the union member is located at offset 0x10 -- which
-means that gcc has inserted 4 bytes of padding between the members ``si_code``
-and ``_sifields``. We can now get a fuller picture of the memory dump::
-
-		 _----------------------------=> si_code
-		/	 _--------------------=> (padding)
-	       |	/	 _------------=> _sifields(._kill._pid)
-	       |       |	/	 _----=> _sifields(._kill._uid)
-	       |       |       |	/
-	-------|-------|-------|-------|
-	80000000000000000000000000000000000000000088ffff0000000000000000
-	 i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u
-
-This allows us to realize another important fact: ``si_code`` contains the
-value 0x80. Remember that x86 is little endian, so the first 4 bytes
-"80000000" are really the number 0x00000080. With a bit of research, we
-find that this is actually the constant ``SI_KERNEL`` defined in
-``include/asm-generic/siginfo.h``::
-
-    144 #define SI_KERNEL	0x80		/* sent by the kernel from somewhere	 */
-
-This macro is used in exactly one place in the x86 kernel: In ``send_signal()``
-in ``kernel/signal.c``::
-
-    816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
-    817				int group)
-    818 {
-    ...
-    828		pending = group ? &t->signal->shared_pending : &t->pending;
-    ...
-    851		q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
-    852						     (is_si_special(info) ||
-    853						      info->si_code >= 0)));
-    854		if (q) {
-    855			list_add_tail(&q->list, &pending->list);
-    856			switch ((unsigned long) info) {
-    ...
-    865			case (unsigned long) SEND_SIG_PRIV:
-    866				q->info.si_signo = sig;
-    867				q->info.si_errno = 0;
-    868				q->info.si_code = SI_KERNEL;
-    869				q->info.si_pid = 0;
-    870				q->info.si_uid = 0;
-    871				break;
-    ...
-    890 }
-
-Not only does this match with the ``.si_code`` member, it also matches the place
-we found earlier when looking for where siginfo_t objects are enqueued on the
-``shared_pending`` list.
-
-So to sum up: It seems that it is the padding introduced by the compiler
-between two struct fields that is uninitialized, and this gets reported when
-we do a ``memcpy()`` on the struct. This means that we have identified a false
-positive warning.
-
-Normally, kmemcheck will not report uninitialized accesses in ``memcpy()`` calls
-when both the source and destination addresses are tracked. (Instead, we copy
-the shadow bytemap as well). In this case, the destination address clearly
-was not tracked. We can dig a little deeper into the stack trace from above::
-
-	arch/x86/kernel/signal.c:805
-	arch/x86/kernel/signal.c:871
-	arch/x86/kernel/entry_64.S:694
-
-And we clearly see that the destination siginfo object is located on the
-stack::
-
-    782 static void do_signal(struct pt_regs *regs)
-    783 {
-    784		struct k_sigaction ka;
-    785		siginfo_t info;
-    ...
-    804		signr = get_signal_to_deliver(&info, &ka, regs, NULL);
-    ...
-    854 }
-
-And this ``&info`` is what eventually gets passed to ``copy_siginfo()`` as the
-destination argument.
-
-Now, even though we didn't find an actual error here, the example is still a
-good one, because it shows how one would go about to find out what the report
-was all about.
-
-
-Annotating false positives
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-There are a few different ways to make annotations in the source code that
-will keep kmemcheck from checking and reporting certain allocations. Here
-they are:
-
-- ``__GFP_NOTRACK_FALSE_POSITIVE``
-	This flag can be passed to ``kmalloc()`` or ``kmem_cache_alloc()``
-	(therefore also to other functions that end up calling one of
-	these) to indicate that the allocation should not be tracked
-	because it would lead to a false positive report. This is a "big
-	hammer" way of silencing kmemcheck; after all, even if the false
-	positive pertains to particular field in a struct, for example, we
-	will now lose the ability to find (real) errors in other parts of
-	the same struct.
-
-	Example::
-
-	    /* No warnings will ever trigger on accessing any part of x */
-	    x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE);
-
-- ``kmemcheck_bitfield_begin(name)``/``kmemcheck_bitfield_end(name)`` and
-	``kmemcheck_annotate_bitfield(ptr, name)``
-	The first two of these three macros can be used inside struct
-	definitions to signal, respectively, the beginning and end of a
-	bitfield. Additionally, this will assign the bitfield a name, which
-	is given as an argument to the macros.
-
-	Having used these markers, one can later use
-	kmemcheck_annotate_bitfield() at the point of allocation, to indicate
-	which parts of the allocation is part of a bitfield.
-
-	Example::
-
-	    struct foo {
-		int x;
-
-		kmemcheck_bitfield_begin(flags);
-		int flag_a:1;
-		int flag_b:1;
-		kmemcheck_bitfield_end(flags);
-
-		int y;
-	    };
-
-	    struct foo *x = kmalloc(sizeof *x);
-
-	    /* No warnings will trigger on accessing the bitfield of x */
-	    kmemcheck_annotate_bitfield(x, flags);
-
-	Note that ``kmemcheck_annotate_bitfield()`` can be used even before the
-	return value of ``kmalloc()`` is checked -- in other words, passing NULL
-	as the first argument is legal (and will do nothing).
-
-
-Reporting errors
-----------------
-
-As we have seen, kmemcheck will produce false positive reports. Therefore, it
-is not very wise to blindly post kmemcheck warnings to mailing lists and
-maintainers. Instead, I encourage maintainers and developers to find errors
-in their own code. If you get a warning, you can try to work around it, try
-to figure out if it's a real error or not, or simply ignore it. Most
-developers know their own code and will quickly and efficiently determine the
-root cause of a kmemcheck report. This is therefore also the most efficient
-way to work with kmemcheck.
-
-That said, we (the kmemcheck maintainers) will always be on the lookout for
-false positives that we can annotate and silence. So whatever you find,
-please drop us a note privately! Kernel configs and steps to reproduce (if
-available) are of course a great help too.
-
-Happy hacking!
-
-
-Technical description
----------------------
-
-kmemcheck works by marking memory pages non-present. This means that whenever
-somebody attempts to access the page, a page fault is generated. The page
-fault handler notices that the page was in fact only hidden, and so it calls
-on the kmemcheck code to make further investigations.
-
-When the investigations are completed, kmemcheck "shows" the page by marking
-it present (as it would be under normal circumstances). This way, the
-interrupted code can continue as usual.
-
-But after the instruction has been executed, we should hide the page again, so
-that we can catch the next access too! Now kmemcheck makes use of a debugging
-feature of the processor, namely single-stepping. When the processor has
-finished the one instruction that generated the memory access, a debug
-exception is raised. From here, we simply hide the page again and continue
-execution, this time with the single-stepping feature turned off.
-
-kmemcheck requires some assistance from the memory allocator in order to work.
-The memory allocator needs to
-
-  1. Tell kmemcheck about newly allocated pages and pages that are about to
-     be freed. This allows kmemcheck to set up and tear down the shadow memory
-     for the pages in question. The shadow memory stores the status of each
-     byte in the allocation proper, e.g. whether it is initialized or
-     uninitialized.
-
-  2. Tell kmemcheck which parts of memory should be marked uninitialized.
-     There are actually a few more states, such as "not yet allocated" and
-     "recently freed".
-
-If a slab cache is set up using the SLAB_NOTRACK flag, it will never return
-memory that can take page faults because of kmemcheck.
-
-If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still
-request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags.
-This does not prevent the page faults from occurring, however, but marks the
-object in question as being initialized so that no warnings will ever be
-produced for this object.
-
-Currently, the SLAB and SLUB allocators are supported by kmemcheck.
diff --git a/MAINTAINERS b/MAINTAINERS
index 7e9c887ad951..ac814d3dd1c1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7688,16 +7688,6 @@ F:	include/linux/kdb.h
 F:	include/linux/kgdb.h
 F:	kernel/debug/
 
-KMEMCHECK
-M:	Vegard Nossum <vegardno@ifi.uio.no>
-M:	Pekka Enberg <penberg@kernel.org>
-S:	Maintained
-F:	Documentation/dev-tools/kmemcheck.rst
-F:	arch/x86/include/asm/kmemcheck.h
-F:	arch/x86/mm/kmemcheck/
-F:	include/linux/kmemcheck.h
-F:	mm/kmemcheck.c
-
 KMEMLEAK
 M:	Catalin Marinas <catalin.marinas@arm.com>
 S:	Maintained
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f08977d82ca0..cb678192da4a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -112,7 +112,6 @@ config X86
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KASAN			if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_ARCH_KGDB
-	select HAVE_ARCH_KMEMCHECK
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
 	select HAVE_ARCH_COMPAT_MMAP_BASES	if MMU && COMPAT
@@ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT
 
 config X86_DIRECT_GBPAGES
 	def_bool y
-	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+	depends on X86_64 && !DEBUG_PAGEALLOC
 	---help---
 	  Certain kernel features effectively disable kernel
 	  linear 1 GB mappings (even if the CPU otherwise
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
index 945a0337fbcf..ea32a7d3cf1b 100644
--- a/arch/x86/include/asm/kmemcheck.h
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -1,43 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASM_X86_KMEMCHECK_H
-#define ASM_X86_KMEMCHECK_H
-
-#include <linux/types.h>
-#include <asm/ptrace.h>
-
-#ifdef CONFIG_KMEMCHECK
-bool kmemcheck_active(struct pt_regs *regs);
-
-void kmemcheck_show(struct pt_regs *regs);
-void kmemcheck_hide(struct pt_regs *regs);
-
-bool kmemcheck_fault(struct pt_regs *regs,
-	unsigned long address, unsigned long error_code);
-bool kmemcheck_trap(struct pt_regs *regs);
-#else
-static inline bool kmemcheck_active(struct pt_regs *regs)
-{
-	return false;
-}
-
-static inline void kmemcheck_show(struct pt_regs *regs)
-{
-}
-
-static inline void kmemcheck_hide(struct pt_regs *regs)
-{
-}
-
-static inline bool kmemcheck_fault(struct pt_regs *regs,
-	unsigned long address, unsigned long error_code)
-{
-	return false;
-}
-
-static inline bool kmemcheck_trap(struct pt_regs *regs)
-{
-	return false;
-}
-#endif /* CONFIG_KMEMCHECK */
-
-#endif
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 076502241eae..55d392c6bd29 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
  *	No 3D Now!
  */
 
-#ifndef CONFIG_KMEMCHECK
-
 #if (__GNUC__ >= 4)
 #define memcpy(t, f, n) __builtin_memcpy(t, f, n)
 #else
@@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
 	 ? __constant_memcpy((t), (f), (n))	\
 	 : __memcpy((t), (f), (n)))
 #endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(t, f, n) __memcpy((t), (f), (n))
-#endif
 
 #endif
 #endif /* !CONFIG_FORTIFY_SOURCE */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 0b1b4445f4c5..533f74c300c2 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len);
 extern void *__memcpy(void *to, const void *from, size_t len);
 
 #ifndef CONFIG_FORTIFY_SOURCE
-#ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
 #define memcpy(dst, src, len)					\
 ({								\
@@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 	__ret;							\
 })
 #endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
-#endif
 #endif /* !CONFIG_FORTIFY_SOURCE */
 
 #define __HAVE_ARCH_MEMSET
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b720dacac051..b1af22073e28 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 	if (c->x86 == 6 && c->x86_model < 15)
 		clear_cpu_cap(c, X86_FEATURE_PAT);
 
-#ifdef CONFIG_KMEMCHECK
-	/*
-	 * P4s have a "fast strings" feature which causes single-
-	 * stepping REP instructions to only generate a #DB on
-	 * cache-line boundaries.
-	 *
-	 * Ingo Molnar reported a Pentium D (model 6) and a Xeon
-	 * (model 2) with the same problem.
-	 */
-	if (c->x86 == 15)
-		if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
-				  MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
-			pr_info("kmemcheck: Disabling fast string operations\n");
-#endif
-
 	/*
 	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
 	 * clear the fast string and enhanced fast string CPU capabilities.
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 7ba7f3d7f477..8e13b8cc6bed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP)	+= debug_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
-obj-$(CONFIG_KMEMCHECK)		+= kmemcheck/
-
 KASAN_SANITIZE_kasan_init_$(BITS).o := n
 obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ef94620ceb8a..6fdf91ef130a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -163,12 +163,11 @@ static int page_size_mask;
 static void __init probe_page_size_mask(void)
 {
 	/*
-	 * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
-	 * use small pages.
+	 * For pagealloc debugging, identity mapping will use small pages.
 	 * This will simplify cpa(), which otherwise needs to support splitting
 	 * large pages into small in interrupt context, etc.
 	 */
-	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
+	if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
 		page_size_mask |= 1 << PG_LEVEL_2M;
 	else
 		direct_gbpages = 0;
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
deleted file mode 100644
index 520b3bce4095..000000000000
--- a/arch/x86/mm/kmemcheck/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 872ec4159a68..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -1,228 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/interrupt.h>
-#include <linux/kdebug.h>
-#include <linux/kmemcheck.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-
-#include "error.h"
-#include "shadow.h"
-
-enum kmemcheck_error_type {
-	KMEMCHECK_ERROR_INVALID_ACCESS,
-	KMEMCHECK_ERROR_BUG,
-};
-
-#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
-
-struct kmemcheck_error {
-	enum kmemcheck_error_type type;
-
-	union {
-		/* KMEMCHECK_ERROR_INVALID_ACCESS */
-		struct {
-			/* Kind of access that caused the error */
-			enum kmemcheck_shadow state;
-			/* Address and size of the erroneous read */
-			unsigned long	address;
-			unsigned int	size;
-		};
-	};
-
-	struct pt_regs		regs;
-	struct stack_trace	trace;
-	unsigned long		trace_entries[32];
-
-	/* We compress it to a char. */
-	unsigned char		shadow_copy[SHADOW_COPY_SIZE];
-	unsigned char		memory_copy[SHADOW_COPY_SIZE];
-};
-
-/*
- * Create a ring queue of errors to output. We can't call printk() directly
- * from the kmemcheck traps, since this may call the console drivers and
- * result in a recursive fault.
- */
-static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
-static unsigned int error_count;
-static unsigned int error_rd;
-static unsigned int error_wr;
-static unsigned int error_missed_count;
-
-static struct kmemcheck_error *error_next_wr(void)
-{
-	struct kmemcheck_error *e;
-
-	if (error_count == ARRAY_SIZE(error_fifo)) {
-		++error_missed_count;
-		return NULL;
-	}
-
-	e = &error_fifo[error_wr];
-	if (++error_wr == ARRAY_SIZE(error_fifo))
-		error_wr = 0;
-	++error_count;
-	return e;
-}
-
-static struct kmemcheck_error *error_next_rd(void)
-{
-	struct kmemcheck_error *e;
-
-	if (error_count == 0)
-		return NULL;
-
-	e = &error_fifo[error_rd];
-	if (++error_rd == ARRAY_SIZE(error_fifo))
-		error_rd = 0;
-	--error_count;
-	return e;
-}
-
-void kmemcheck_error_recall(void)
-{
-	static const char *desc[] = {
-		[KMEMCHECK_SHADOW_UNALLOCATED]		= "unallocated",
-		[KMEMCHECK_SHADOW_UNINITIALIZED]	= "uninitialized",
-		[KMEMCHECK_SHADOW_INITIALIZED]		= "initialized",
-		[KMEMCHECK_SHADOW_FREED]		= "freed",
-	};
-
-	static const char short_desc[] = {
-		[KMEMCHECK_SHADOW_UNALLOCATED]		= 'a',
-		[KMEMCHECK_SHADOW_UNINITIALIZED]	= 'u',
-		[KMEMCHECK_SHADOW_INITIALIZED]		= 'i',
-		[KMEMCHECK_SHADOW_FREED]		= 'f',
-	};
-
-	struct kmemcheck_error *e;
-	unsigned int i;
-
-	e = error_next_rd();
-	if (!e)
-		return;
-
-	switch (e->type) {
-	case KMEMCHECK_ERROR_INVALID_ACCESS:
-		printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
-			8 * e->size, e->state < ARRAY_SIZE(desc) ?
-				desc[e->state] : "(invalid shadow state)",
-			(void *) e->address);
-
-		printk(KERN_WARNING);
-		for (i = 0; i < SHADOW_COPY_SIZE; ++i)
-			printk(KERN_CONT "%02x", e->memory_copy[i]);
-		printk(KERN_CONT "\n");
-
-		printk(KERN_WARNING);
-		for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
-			if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
-				printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
-			else
-				printk(KERN_CONT " ?");
-		}
-		printk(KERN_CONT "\n");
-		printk(KERN_WARNING "%*c\n", 2 + 2
-			* (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
-		break;
-	case KMEMCHECK_ERROR_BUG:
-		printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
-		break;
-	}
-
-	__show_regs(&e->regs, 1);
-	print_stack_trace(&e->trace, 0);
-}
-
-static void do_wakeup(unsigned long data)
-{
-	while (error_count > 0)
-		kmemcheck_error_recall();
-
-	if (error_missed_count > 0) {
-		printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
-			"the queue was too small\n", error_missed_count);
-		error_missed_count = 0;
-	}
-}
-
-static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
-
-/*
- * Save the context of an error report.
- */
-void kmemcheck_error_save(enum kmemcheck_shadow state,
-	unsigned long address, unsigned int size, struct pt_regs *regs)
-{
-	static unsigned long prev_ip;
-
-	struct kmemcheck_error *e;
-	void *shadow_copy;
-	void *memory_copy;
-
-	/* Don't report several adjacent errors from the same EIP. */
-	if (regs->ip == prev_ip)
-		return;
-	prev_ip = regs->ip;
-
-	e = error_next_wr();
-	if (!e)
-		return;
-
-	e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
-
-	e->state = state;
-	e->address = address;
-	e->size = size;
-
-	/* Save regs */
-	memcpy(&e->regs, regs, sizeof(*regs));
-
-	/* Save stack trace */
-	e->trace.nr_entries = 0;
-	e->trace.entries = e->trace_entries;
-	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
-	e->trace.skip = 0;
-	save_stack_trace_regs(regs, &e->trace);
-
-	/* Round address down to nearest 16 bytes */
-	shadow_copy = kmemcheck_shadow_lookup(address
-		& ~(SHADOW_COPY_SIZE - 1));
-	BUG_ON(!shadow_copy);
-
-	memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
-
-	kmemcheck_show_addr(address);
-	memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
-	memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
-	kmemcheck_hide_addr(address);
-
-	tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
-
-/*
- * Save the context of a kmemcheck bug.
- */
-void kmemcheck_error_save_bug(struct pt_regs *regs)
-{
-	struct kmemcheck_error *e;
-
-	e = error_next_wr();
-	if (!e)
-		return;
-
-	e->type = KMEMCHECK_ERROR_BUG;
-
-	memcpy(&e->regs, regs, sizeof(*regs));
-
-	e->trace.nr_entries = 0;
-	e->trace.entries = e->trace_entries;
-	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
-	e->trace.skip = 1;
-	save_stack_trace(&e->trace);
-
-	tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
index 39f80d7a874d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/error.h
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -1,16 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
-#define ARCH__X86__MM__KMEMCHECK__ERROR_H
-
-#include <linux/ptrace.h>
-
-#include "shadow.h"
-
-void kmemcheck_error_save(enum kmemcheck_shadow state,
-	unsigned long address, unsigned int size, struct pt_regs *regs);
-
-void kmemcheck_error_save_bug(struct pt_regs *regs);
-
-void kmemcheck_error_recall(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
deleted file mode 100644
index 4515bae36bbe..000000000000
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/**
- * kmemcheck - a heavyweight memory checker for the linux kernel
- * Copyright (C) 2007, 2008  Vegard Nossum <vegardno@ifi.uio.no>
- * (With a lot of help from Ingo Molnar and Pekka Enberg.)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License (version 2) as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <asm/cacheflush.h>
-#include <asm/kmemcheck.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-#include "error.h"
-#include "opcode.h"
-#include "pte.h"
-#include "selftest.h"
-#include "shadow.h"
-
-
-#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 0
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 1
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
-#  define KMEMCHECK_ENABLED 2
-#endif
-
-int kmemcheck_enabled = KMEMCHECK_ENABLED;
-
-int __init kmemcheck_init(void)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * Limit SMP to use a single CPU. We rely on the fact that this code
-	 * runs before SMP is set up.
-	 */
-	if (setup_max_cpus > 1) {
-		printk(KERN_INFO
-			"kmemcheck: Limiting number of CPUs to 1.\n");
-		setup_max_cpus = 1;
-	}
-#endif
-
-	if (!kmemcheck_selftest()) {
-		printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
-		kmemcheck_enabled = 0;
-		return -EINVAL;
-	}
-
-	printk(KERN_INFO "kmemcheck: Initialized\n");
-	return 0;
-}
-
-early_initcall(kmemcheck_init);
-
-/*
- * We need to parse the kmemcheck= option before any memory is allocated.
- */
-static int __init param_kmemcheck(char *str)
-{
-	int val;
-	int ret;
-
-	if (!str)
-		return -EINVAL;
-
-	ret = kstrtoint(str, 0, &val);
-	if (ret)
-		return ret;
-	kmemcheck_enabled = val;
-	return 0;
-}
-
-early_param("kmemcheck", param_kmemcheck);
-
-int kmemcheck_show_addr(unsigned long address)
-{
-	pte_t *pte;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return 0;
-
-	set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
-	__flush_tlb_one(address);
-	return 1;
-}
-
-int kmemcheck_hide_addr(unsigned long address)
-{
-	pte_t *pte;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return 0;
-
-	set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
-	__flush_tlb_one(address);
-	return 1;
-}
-
-struct kmemcheck_context {
-	bool busy;
-	int balance;
-
-	/*
-	 * There can be at most two memory operands to an instruction, but
-	 * each address can cross a page boundary -- so we may need up to
-	 * four addresses that must be hidden/revealed for each fault.
-	 */
-	unsigned long addr[4];
-	unsigned long n_addrs;
-	unsigned long flags;
-
-	/* Data size of the instruction that caused a fault. */
-	unsigned int size;
-};
-
-static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
-
-bool kmemcheck_active(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	return data->balance > 0;
-}
-
-/* Save an address that needs to be shown/hidden */
-static void kmemcheck_save_addr(unsigned long addr)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
-	data->addr[data->n_addrs++] = addr;
-}
-
-static unsigned int kmemcheck_show_all(void)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	unsigned int i;
-	unsigned int n;
-
-	n = 0;
-	for (i = 0; i < data->n_addrs; ++i)
-		n += kmemcheck_show_addr(data->addr[i]);
-
-	return n;
-}
-
-static unsigned int kmemcheck_hide_all(void)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	unsigned int i;
-	unsigned int n;
-
-	n = 0;
-	for (i = 0; i < data->n_addrs; ++i)
-		n += kmemcheck_hide_addr(data->addr[i]);
-
-	return n;
-}
-
-/*
- * Called from the #PF handler.
- */
-void kmemcheck_show(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	BUG_ON(!irqs_disabled());
-
-	if (unlikely(data->balance != 0)) {
-		kmemcheck_show_all();
-		kmemcheck_error_save_bug(regs);
-		data->balance = 0;
-		return;
-	}
-
-	/*
-	 * None of the addresses actually belonged to kmemcheck. Note that
-	 * this is not an error.
-	 */
-	if (kmemcheck_show_all() == 0)
-		return;
-
-	++data->balance;
-
-	/*
-	 * The IF needs to be cleared as well, so that the faulting
-	 * instruction can run "uninterrupted". Otherwise, we might take
-	 * an interrupt and start executing that before we've had a chance
-	 * to hide the page again.
-	 *
-	 * NOTE: In the rare case of multiple faults, we must not override
-	 * the original flags:
-	 */
-	if (!(regs->flags & X86_EFLAGS_TF))
-		data->flags = regs->flags;
-
-	regs->flags |= X86_EFLAGS_TF;
-	regs->flags &= ~X86_EFLAGS_IF;
-}
-
-/*
- * Called from the #DB handler.
- */
-void kmemcheck_hide(struct pt_regs *regs)
-{
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-	int n;
-
-	BUG_ON(!irqs_disabled());
-
-	if (unlikely(data->balance != 1)) {
-		kmemcheck_show_all();
-		kmemcheck_error_save_bug(regs);
-		data->n_addrs = 0;
-		data->balance = 0;
-
-		if (!(data->flags & X86_EFLAGS_TF))
-			regs->flags &= ~X86_EFLAGS_TF;
-		if (data->flags & X86_EFLAGS_IF)
-			regs->flags |= X86_EFLAGS_IF;
-		return;
-	}
-
-	if (kmemcheck_enabled)
-		n = kmemcheck_hide_all();
-	else
-		n = kmemcheck_show_all();
-
-	if (n == 0)
-		return;
-
-	--data->balance;
-
-	data->n_addrs = 0;
-
-	if (!(data->flags & X86_EFLAGS_TF))
-		regs->flags &= ~X86_EFLAGS_TF;
-	if (data->flags & X86_EFLAGS_IF)
-		regs->flags |= X86_EFLAGS_IF;
-}
-
-void kmemcheck_show_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i) {
-		unsigned long address;
-		pte_t *pte;
-		unsigned int level;
-
-		address = (unsigned long) page_address(&p[i]);
-		pte = lookup_address(address, &level);
-		BUG_ON(!pte);
-		BUG_ON(level != PG_LEVEL_4K);
-
-		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
-		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
-		__flush_tlb_one(address);
-	}
-}
-
-bool kmemcheck_page_is_tracked(struct page *p)
-{
-	/* This will also check the "hidden" flag of the PTE. */
-	return kmemcheck_pte_lookup((unsigned long) page_address(p));
-}
-
-void kmemcheck_hide_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i) {
-		unsigned long address;
-		pte_t *pte;
-		unsigned int level;
-
-		address = (unsigned long) page_address(&p[i]);
-		pte = lookup_address(address, &level);
-		BUG_ON(!pte);
-		BUG_ON(level != PG_LEVEL_4K);
-
-		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
-		set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
-		__flush_tlb_one(address);
-	}
-}
-
-/* Access may NOT cross page boundary */
-static void kmemcheck_read_strict(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	void *shadow;
-	enum kmemcheck_shadow status;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return;
-
-	kmemcheck_save_addr(addr);
-	status = kmemcheck_shadow_test(shadow, size);
-	if (status == KMEMCHECK_SHADOW_INITIALIZED)
-		return;
-
-	if (kmemcheck_enabled)
-		kmemcheck_error_save(status, addr, size, regs);
-
-	if (kmemcheck_enabled == 2)
-		kmemcheck_enabled = 0;
-
-	/* Don't warn about it again. */
-	kmemcheck_shadow_set(shadow, size);
-}
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
-	enum kmemcheck_shadow status;
-	void *shadow;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return true;
-
-	status = kmemcheck_shadow_test_all(shadow, size);
-
-	return status == KMEMCHECK_SHADOW_INITIALIZED;
-}
-
-/* Access may cross page boundary */
-static void kmemcheck_read(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long next_addr = addr + size - 1;
-	unsigned long next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		kmemcheck_read_strict(regs, addr, size);
-		return;
-	}
-
-	/*
-	 * What we do is basically to split the access across the
-	 * two pages and handle each part separately. Yes, this means
-	 * that we may now see reads that are 3 + 5 bytes, for
-	 * example (and if both are uninitialized, there will be two
-	 * reports), but it makes the code a lot simpler.
-	 */
-	kmemcheck_read_strict(regs, addr, next_page - addr);
-	kmemcheck_read_strict(regs, next_page, next_addr - next_page);
-}
-
-static void kmemcheck_write_strict(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	void *shadow;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (!shadow)
-		return;
-
-	kmemcheck_save_addr(addr);
-	kmemcheck_shadow_set(shadow, size);
-}
-
-static void kmemcheck_write(struct pt_regs *regs,
-	unsigned long addr, unsigned int size)
-{
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long next_addr = addr + size - 1;
-	unsigned long next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		kmemcheck_write_strict(regs, addr, size);
-		return;
-	}
-
-	/* See comment in kmemcheck_read(). */
-	kmemcheck_write_strict(regs, addr, next_page - addr);
-	kmemcheck_write_strict(regs, next_page, next_addr - next_page);
-}
-
-/*
- * Copying is hard. We have two addresses, each of which may be split across
- * a page (and each page will have different shadow addresses).
- */
-static void kmemcheck_copy(struct pt_regs *regs,
-	unsigned long src_addr, unsigned long dst_addr, unsigned int size)
-{
-	uint8_t shadow[8];
-	enum kmemcheck_shadow status;
-
-	unsigned long page;
-	unsigned long next_addr;
-	unsigned long next_page;
-
-	uint8_t *x;
-	unsigned int i;
-	unsigned int n;
-
-	BUG_ON(size > sizeof(shadow));
-
-	page = src_addr & PAGE_MASK;
-	next_addr = src_addr + size - 1;
-	next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		/* Same page */
-		x = kmemcheck_shadow_lookup(src_addr);
-		if (x) {
-			kmemcheck_save_addr(src_addr);
-			for (i = 0; i < size; ++i)
-				shadow[i] = x[i];
-		} else {
-			for (i = 0; i < size; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-	} else {
-		n = next_page - src_addr;
-		BUG_ON(n > sizeof(shadow));
-
-		/* First page */
-		x = kmemcheck_shadow_lookup(src_addr);
-		if (x) {
-			kmemcheck_save_addr(src_addr);
-			for (i = 0; i < n; ++i)
-				shadow[i] = x[i];
-		} else {
-			/* Not tracked */
-			for (i = 0; i < n; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-
-		/* Second page */
-		x = kmemcheck_shadow_lookup(next_page);
-		if (x) {
-			kmemcheck_save_addr(next_page);
-			for (i = n; i < size; ++i)
-				shadow[i] = x[i - n];
-		} else {
-			/* Not tracked */
-			for (i = n; i < size; ++i)
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-		}
-	}
-
-	page = dst_addr & PAGE_MASK;
-	next_addr = dst_addr + size - 1;
-	next_page = next_addr & PAGE_MASK;
-
-	if (likely(page == next_page)) {
-		/* Same page */
-		x = kmemcheck_shadow_lookup(dst_addr);
-		if (x) {
-			kmemcheck_save_addr(dst_addr);
-			for (i = 0; i < size; ++i) {
-				x[i] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-	} else {
-		n = next_page - dst_addr;
-		BUG_ON(n > sizeof(shadow));
-
-		/* First page */
-		x = kmemcheck_shadow_lookup(dst_addr);
-		if (x) {
-			kmemcheck_save_addr(dst_addr);
-			for (i = 0; i < n; ++i) {
-				x[i] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-
-		/* Second page */
-		x = kmemcheck_shadow_lookup(next_page);
-		if (x) {
-			kmemcheck_save_addr(next_page);
-			for (i = n; i < size; ++i) {
-				x[i - n] = shadow[i];
-				shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
-			}
-		}
-	}
-
-	status = kmemcheck_shadow_test(shadow, size);
-	if (status == KMEMCHECK_SHADOW_INITIALIZED)
-		return;
-
-	if (kmemcheck_enabled)
-		kmemcheck_error_save(status, src_addr, size, regs);
-
-	if (kmemcheck_enabled == 2)
-		kmemcheck_enabled = 0;
-}
-
-enum kmemcheck_method {
-	KMEMCHECK_READ,
-	KMEMCHECK_WRITE,
-};
-
-static void kmemcheck_access(struct pt_regs *regs,
-	unsigned long fallback_address, enum kmemcheck_method fallback_method)
-{
-	const uint8_t *insn;
-	const uint8_t *insn_primary;
-	unsigned int size;
-
-	struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
-	/* Recursive fault -- ouch. */
-	if (data->busy) {
-		kmemcheck_show_addr(fallback_address);
-		kmemcheck_error_save_bug(regs);
-		return;
-	}
-
-	data->busy = true;
-
-	insn = (const uint8_t *) regs->ip;
-	insn_primary = kmemcheck_opcode_get_primary(insn);
-
-	kmemcheck_opcode_decode(insn, &size);
-
-	switch (insn_primary[0]) {
-#ifdef CONFIG_KMEMCHECK_BITOPS_OK
-		/* AND, OR, XOR */
-		/*
-		 * Unfortunately, these instructions have to be excluded from
-		 * our regular checking since they access only some (and not
-		 * all) bits. This clears out "bogus" bitfield-access warnings.
-		 */
-	case 0x80:
-	case 0x81:
-	case 0x82:
-	case 0x83:
-		switch ((insn_primary[1] >> 3) & 7) {
-			/* OR */
-		case 1:
-			/* AND */
-		case 4:
-			/* XOR */
-		case 6:
-			kmemcheck_write(regs, fallback_address, size);
-			goto out;
-
-			/* ADD */
-		case 0:
-			/* ADC */
-		case 2:
-			/* SBB */
-		case 3:
-			/* SUB */
-		case 5:
-			/* CMP */
-		case 7:
-			break;
-		}
-		break;
-#endif
-
-		/* MOVS, MOVSB, MOVSW, MOVSD */
-	case 0xa4:
-	case 0xa5:
-		/*
-		 * These instructions are special because they take two
-		 * addresses, but we only get one page fault.
-		 */
-		kmemcheck_copy(regs, regs->si, regs->di, size);
-		goto out;
-
-		/* CMPS, CMPSB, CMPSW, CMPSD */
-	case 0xa6:
-	case 0xa7:
-		kmemcheck_read(regs, regs->si, size);
-		kmemcheck_read(regs, regs->di, size);
-		goto out;
-	}
-
-	/*
-	 * If the opcode isn't special in any way, we use the data from the
-	 * page fault handler to determine the address and type of memory
-	 * access.
-	 */
-	switch (fallback_method) {
-	case KMEMCHECK_READ:
-		kmemcheck_read(regs, fallback_address, size);
-		goto out;
-	case KMEMCHECK_WRITE:
-		kmemcheck_write(regs, fallback_address, size);
-		goto out;
-	}
-
-out:
-	data->busy = false;
-}
-
-bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
-	unsigned long error_code)
-{
-	pte_t *pte;
-
-	/*
-	 * XXX: Is it safe to assume that memory accesses from virtual 86
-	 * mode or non-kernel code segments will _never_ access kernel
-	 * memory (e.g. tracked pages)? For now, we need this to avoid
-	 * invoking kmemcheck for PnP BIOS calls.
-	 */
-	if (regs->flags & X86_VM_MASK)
-		return false;
-	if (regs->cs != __KERNEL_CS)
-		return false;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return false;
-
-	WARN_ON_ONCE(in_nmi());
-
-	if (error_code & 2)
-		kmemcheck_access(regs, address, KMEMCHECK_WRITE);
-	else
-		kmemcheck_access(regs, address, KMEMCHECK_READ);
-
-	kmemcheck_show(regs);
-	return true;
-}
-
-bool kmemcheck_trap(struct pt_regs *regs)
-{
-	if (!kmemcheck_active(regs))
-		return false;
-
-	/* We're done. */
-	kmemcheck_hide(regs);
-	return true;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index df8109ddf7fe..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -1,107 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/types.h>
-
-#include "opcode.h"
-
-static bool opcode_is_prefix(uint8_t b)
-{
-	return
-		/* Group 1 */
-		b == 0xf0 || b == 0xf2 || b == 0xf3
-		/* Group 2 */
-		|| b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
-		|| b == 0x64 || b == 0x65
-		/* Group 3 */
-		|| b == 0x66
-		/* Group 4 */
-		|| b == 0x67;
-}
-
-#ifdef CONFIG_X86_64
-static bool opcode_is_rex_prefix(uint8_t b)
-{
-	return (b & 0xf0) == 0x40;
-}
-#else
-static bool opcode_is_rex_prefix(uint8_t b)
-{
-	return false;
-}
-#endif
-
-#define REX_W (1 << 3)
-
-/*
- * This is a VERY crude opcode decoder. We only need to find the size of the
- * load/store that caused our #PF and this should work for all the opcodes
- * that we care about. Moreover, the ones who invented this instruction set
- * should be shot.
- */
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
-{
-	/* Default operand size */
-	int operand_size_override = 4;
-
-	/* prefixes */
-	for (; opcode_is_prefix(*op); ++op) {
-		if (*op == 0x66)
-			operand_size_override = 2;
-	}
-
-	/* REX prefix */
-	if (opcode_is_rex_prefix(*op)) {
-		uint8_t rex = *op;
-
-		++op;
-		if (rex & REX_W) {
-			switch (*op) {
-			case 0x63:
-				*size = 4;
-				return;
-			case 0x0f:
-				++op;
-
-				switch (*op) {
-				case 0xb6:
-				case 0xbe:
-					*size = 1;
-					return;
-				case 0xb7:
-				case 0xbf:
-					*size = 2;
-					return;
-				}
-
-				break;
-			}
-
-			*size = 8;
-			return;
-		}
-	}
-
-	/* escape opcode */
-	if (*op == 0x0f) {
-		++op;
-
-		/*
-		 * This is move with zero-extend and sign-extend, respectively;
-		 * we don't have to think about 0xb6/0xbe, because this is
-		 * already handled in the conditional below.
-		 */
-		if (*op == 0xb7 || *op == 0xbf)
-			operand_size_override = 2;
-	}
-
-	*size = (*op & 1) ? operand_size_override : 1;
-}
-
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
-{
-	/* skip prefixes */
-	while (opcode_is_prefix(*op))
-		++op;
-	if (opcode_is_rex_prefix(*op))
-		++op;
-	return op;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
index 51a1ce94c24a..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -1,10 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
-#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
-
-#include <linux/types.h>
-
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
index 8a03be90272a..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/pte.c
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -1,23 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-#include "pte.h"
-
-pte_t *kmemcheck_pte_lookup(unsigned long address)
-{
-	pte_t *pte;
-	unsigned int level;
-
-	pte = lookup_address(address, &level);
-	if (!pte)
-		return NULL;
-	if (level != PG_LEVEL_4K)
-		return NULL;
-	if (!pte_hidden(*pte))
-		return NULL;
-
-	return pte;
-}
-
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
index b595612382c2..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/pte.h
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -1,11 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
-#define ARCH__X86__MM__KMEMCHECK__PTE_H
-
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-pte_t *kmemcheck_pte_lookup(unsigned long address);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 7ce0be1f99eb..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,71 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/bug.h>
-#include <linux/kernel.h>
-
-#include "opcode.h"
-#include "selftest.h"
-
-struct selftest_opcode {
-	unsigned int expected_size;
-	const uint8_t *insn;
-	const char *desc;
-};
-
-static const struct selftest_opcode selftest_opcodes[] = {
-	/* REP MOVS */
-	{1, "\xf3\xa4", 		"rep movsb <mem8>, <mem8>"},
-	{4, "\xf3\xa5",			"rep movsl <mem32>, <mem32>"},
-
-	/* MOVZX / MOVZXD */
-	{1, "\x66\x0f\xb6\x51\xf8",	"movzwq <mem8>, <reg16>"},
-	{1, "\x0f\xb6\x51\xf8",		"movzwq <mem8>, <reg32>"},
-
-	/* MOVSX / MOVSXD */
-	{1, "\x66\x0f\xbe\x51\xf8",	"movswq <mem8>, <reg16>"},
-	{1, "\x0f\xbe\x51\xf8",		"movswq <mem8>, <reg32>"},
-
-#ifdef CONFIG_X86_64
-	/* MOVZX / MOVZXD */
-	{1, "\x49\x0f\xb6\x51\xf8",	"movzbq <mem8>, <reg64>"},
-	{2, "\x49\x0f\xb7\x51\xf8",	"movzbq <mem16>, <reg64>"},
-
-	/* MOVSX / MOVSXD */
-	{1, "\x49\x0f\xbe\x51\xf8",	"movsbq <mem8>, <reg64>"},
-	{2, "\x49\x0f\xbf\x51\xf8",	"movsbq <mem16>, <reg64>"},
-	{4, "\x49\x63\x51\xf8",		"movslq <mem32>, <reg64>"},
-#endif
-};
-
-static bool selftest_opcode_one(const struct selftest_opcode *op)
-{
-	unsigned size;
-
-	kmemcheck_opcode_decode(op->insn, &size);
-
-	if (size == op->expected_size)
-		return true;
-
-	printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
-		op->desc, op->expected_size, size);
-	return false;
-}
-
-static bool selftest_opcodes_all(void)
-{
-	bool pass = true;
-	unsigned int i;
-
-	for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
-		pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
-
-	return pass;
-}
-
-bool kmemcheck_selftest(void)
-{
-	bool pass = true;
-
-	pass = pass && selftest_opcodes_all();
-
-	return pass;
-}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
index 8d759aae453d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -1,7 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-
-bool kmemcheck_selftest(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
deleted file mode 100644
index c2638a7d2c10..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <linux/kmemcheck.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#include "pte.h"
-#include "shadow.h"
-
-/*
- * Return the shadow address for the given address. Returns NULL if the
- * address is not tracked.
- *
- * We need to be extremely careful not to follow any invalid pointers,
- * because this function can be called for *any* possible address.
- */
-void *kmemcheck_shadow_lookup(unsigned long address)
-{
-	pte_t *pte;
-	struct page *page;
-
-	if (!virt_addr_valid(address))
-		return NULL;
-
-	pte = kmemcheck_pte_lookup(address);
-	if (!pte)
-		return NULL;
-
-	page = virt_to_page(address);
-	if (!page->shadow)
-		return NULL;
-	return page->shadow + (address & (PAGE_SIZE - 1));
-}
-
-static void mark_shadow(void *address, unsigned int n,
-	enum kmemcheck_shadow status)
-{
-	unsigned long addr = (unsigned long) address;
-	unsigned long last_addr = addr + n - 1;
-	unsigned long page = addr & PAGE_MASK;
-	unsigned long last_page = last_addr & PAGE_MASK;
-	unsigned int first_n;
-	void *shadow;
-
-	/* If the memory range crosses a page boundary, stop there. */
-	if (page == last_page)
-		first_n = n;
-	else
-		first_n = page + PAGE_SIZE - addr;
-
-	shadow = kmemcheck_shadow_lookup(addr);
-	if (shadow)
-		memset(shadow, status, first_n);
-
-	addr += first_n;
-	n -= first_n;
-
-	/* Do full-page memset()s. */
-	while (n >= PAGE_SIZE) {
-		shadow = kmemcheck_shadow_lookup(addr);
-		if (shadow)
-			memset(shadow, status, PAGE_SIZE);
-
-		addr += PAGE_SIZE;
-		n -= PAGE_SIZE;
-	}
-
-	/* Do the remaining page, if any. */
-	if (n > 0) {
-		shadow = kmemcheck_shadow_lookup(addr);
-		if (shadow)
-			memset(shadow, status, n);
-	}
-}
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
-}
-
-void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
-}
-
-/*
- * Fill the shadow memory of the given address such that the memory at that
- * address is marked as being initialized.
- */
-void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
-}
-EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
-
-void kmemcheck_mark_freed(void *address, unsigned int n)
-{
-	mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
-}
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
-{
-	unsigned int i;
-
-	for (i = 0; i < n; ++i)
-		kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
-{
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-
-	/*
-	 * Make sure _some_ bytes are initialized. Gcc frequently generates
-	 * code to access neighboring bytes.
-	 */
-	for (i = 0; i < size; ++i) {
-		if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
-			return x[i];
-	}
-
-	return x[0];
-#else
-	return kmemcheck_shadow_test_all(shadow, size);
-#endif
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
-{
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-
-	/* All bytes must be initialized. */
-	for (i = 0; i < size; ++i) {
-		if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
-			return x[i];
-	}
-
-	return x[0];
-}
-
-void kmemcheck_shadow_set(void *shadow, unsigned int size)
-{
-	uint8_t *x;
-	unsigned int i;
-
-	x = shadow;
-	for (i = 0; i < size; ++i)
-		x[i] = KMEMCHECK_SHADOW_INITIALIZED;
-}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index 49768dc18664..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -1,19 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
-#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
-
-enum kmemcheck_shadow {
-	KMEMCHECK_SHADOW_UNALLOCATED,
-	KMEMCHECK_SHADOW_UNINITIALIZED,
-	KMEMCHECK_SHADOW_INITIALIZED,
-	KMEMCHECK_SHADOW_FREED,
-};
-
-void *kmemcheck_shadow_lookup(unsigned long address);
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
-						unsigned int size);
-void kmemcheck_shadow_set(void *shadow, unsigned int size);
-
-#endif
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index baeb872283d9..69c238210325 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -594,21 +594,6 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t)
 		__tasklet_hi_schedule(t);
 }
 
-extern void __tasklet_hi_schedule_first(struct tasklet_struct *t);
-
-/*
- * This version avoids touching any other tasklets. Needed for kmemcheck
- * in order not to take any page faults while enqueueing this tasklet;
- * consider VERY carefully whether you really need this or
- * tasklet_hi_schedule()...
- */
-static inline void tasklet_hi_schedule_first(struct tasklet_struct *t)
-{
-	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
-		__tasklet_hi_schedule_first(t);
-}
-
-
 static inline void tasklet_disable_nosync(struct tasklet_struct *t)
 {
 	atomic_inc(&t->count);
diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h
index 7b1d7bead7d9..ea32a7d3cf1b 100644
--- a/include/linux/kmemcheck.h
+++ b/include/linux/kmemcheck.h
@@ -1,172 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef LINUX_KMEMCHECK_H
-#define LINUX_KMEMCHECK_H
-
-#include <linux/mm_types.h>
-#include <linux/types.h>
-
-#ifdef CONFIG_KMEMCHECK
-extern int kmemcheck_enabled;
-
-/* The slab-related functions. */
-void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node);
-void kmemcheck_free_shadow(struct page *page, int order);
-void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
-			  size_t size);
-void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size);
-
-void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order,
-			       gfp_t gfpflags);
-
-void kmemcheck_show_pages(struct page *p, unsigned int n);
-void kmemcheck_hide_pages(struct page *p, unsigned int n);
-
-bool kmemcheck_page_is_tracked(struct page *p);
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n);
-void kmemcheck_mark_uninitialized(void *address, unsigned int n);
-void kmemcheck_mark_initialized(void *address, unsigned int n);
-void kmemcheck_mark_freed(void *address, unsigned int n);
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n);
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n);
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n);
-
-int kmemcheck_show_addr(unsigned long address);
-int kmemcheck_hide_addr(unsigned long address);
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size);
-
-/*
- * Bitfield annotations
- *
- * How to use: If you have a struct using bitfields, for example
- *
- *     struct a {
- *             int x:8, y:8;
- *     };
- *
- * then this should be rewritten as
- *
- *     struct a {
- *             kmemcheck_bitfield_begin(flags);
- *             int x:8, y:8;
- *             kmemcheck_bitfield_end(flags);
- *     };
- *
- * Now the "flags_begin" and "flags_end" members may be used to refer to the
- * beginning and end, respectively, of the bitfield (and things like
- * &x.flags_begin is allowed). As soon as the struct is allocated, the bit-
- * fields should be annotated:
- *
- *     struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL);
- *     kmemcheck_annotate_bitfield(a, flags);
- */
-#define kmemcheck_bitfield_begin(name)	\
-	int name##_begin[0];
-
-#define kmemcheck_bitfield_end(name)	\
-	int name##_end[0];
-
-#define kmemcheck_annotate_bitfield(ptr, name)				\
-	do {								\
-		int _n;							\
-									\
-		if (!ptr)						\
-			break;						\
-									\
-		_n = (long) &((ptr)->name##_end)			\
-			- (long) &((ptr)->name##_begin);		\
-		BUILD_BUG_ON(_n < 0);					\
-									\
-		kmemcheck_mark_initialized(&((ptr)->name##_begin), _n);	\
-	} while (0)
-
-#define kmemcheck_annotate_variable(var)				\
-	do {								\
-		kmemcheck_mark_initialized(&(var), sizeof(var));	\
-	} while (0)							\
-
-#else
-#define kmemcheck_enabled 0
-
-static inline void
-kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
-{
-}
-
-static inline void
-kmemcheck_free_shadow(struct page *page, int order)
-{
-}
-
-static inline void
-kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
-		     size_t size)
-{
-}
-
-static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object,
-				       size_t size)
-{
-}
-
-static inline void kmemcheck_pagealloc_alloc(struct page *p,
-	unsigned int order, gfp_t gfpflags)
-{
-}
-
-static inline bool kmemcheck_page_is_tracked(struct page *p)
-{
-	return false;
-}
-
-static inline void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_freed(void *address, unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_unallocated_pages(struct page *p,
-						    unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_uninitialized_pages(struct page *p,
-						      unsigned int n)
-{
-}
-
-static inline void kmemcheck_mark_initialized_pages(struct page *p,
-						    unsigned int n)
-{
-}
-
-static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
-	return true;
-}
-
-#define kmemcheck_bitfield_begin(name)
-#define kmemcheck_bitfield_end(name)
-#define kmemcheck_annotate_bitfield(ptr, name)	\
-	do {					\
-	} while (0)
-
-#define kmemcheck_annotate_variable(var)	\
-	do {					\
-	} while (0)
-
-#endif /* CONFIG_KMEMCHECK */
-
-#endif /* LINUX_KMEMCHECK_H */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 662f7b1b7a78..2f5e87f1bae2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
 }
 EXPORT_SYMBOL(__tasklet_hi_schedule);
 
-void __tasklet_hi_schedule_first(struct tasklet_struct *t)
-{
-	lockdep_assert_irqs_disabled();
-
-	t->next = __this_cpu_read(tasklet_hi_vec.head);
-	__this_cpu_write(tasklet_hi_vec.head, t);
-	__raise_softirq_irqoff(HI_SOFTIRQ);
-}
-EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-
 static __latent_entropy void tasklet_action(struct softirq_action *a)
 {
 	struct tasklet_struct *list;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9576bd582d4a..7638e2f7fff8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,7 +30,6 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
-#include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -1173,15 +1172,6 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_thousand,
 	},
-#endif
-#ifdef CONFIG_KMEMCHECK
-	{
-		.procname	= "kmemcheck",
-		.data		= &kmemcheck_enabled,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #endif
 	{
 		.procname	= "panic_on_warn",
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 07ce7449765a..5402e3954659 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -504,7 +504,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
 
 config DEBUG_SLAB
 	bool "Debug slab memory allocations"
-	depends on DEBUG_KERNEL && SLAB && !KMEMCHECK
+	depends on DEBUG_KERNEL && SLAB
 	help
 	  Say Y here to have the kernel do limited verification on memory
 	  allocation as well as poisoning memory on free to catch use of freed
@@ -516,7 +516,7 @@ config DEBUG_SLAB_LEAK
 
 config SLUB_DEBUG_ON
 	bool "SLUB debugging on by default"
-	depends on SLUB && SLUB_DEBUG && !KMEMCHECK
+	depends on SLUB && SLUB_DEBUG
 	default n
 	help
 	  Boot with debugging on by default. SLUB boots by default with
@@ -730,8 +730,6 @@ config DEBUG_STACKOVERFLOW
 
 	  If in doubt, say "N".
 
-source "lib/Kconfig.kmemcheck"
-
 source "lib/Kconfig.kasan"
 
 endmenu # "Memory Debugging"
diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck
deleted file mode 100644
index 846e039a86b4..000000000000
--- a/lib/Kconfig.kmemcheck
+++ /dev/null
@@ -1,94 +0,0 @@
-config HAVE_ARCH_KMEMCHECK
-	bool
-
-if HAVE_ARCH_KMEMCHECK
-
-menuconfig KMEMCHECK
-	bool "kmemcheck: trap use of uninitialized memory"
-	depends on DEBUG_KERNEL
-	depends on !X86_USE_3DNOW
-	depends on SLUB || SLAB
-	depends on !CC_OPTIMIZE_FOR_SIZE
-	depends on !FUNCTION_TRACER
-	select FRAME_POINTER
-	select STACKTRACE
-	default n
-	help
-	  This option enables tracing of dynamically allocated kernel memory
-	  to see if memory is used before it has been given an initial value.
-	  Be aware that this requires half of your memory for bookkeeping and
-	  will insert extra code at *every* read and write to tracked memory
-	  thus slow down the kernel code (but user code is unaffected).
-
-	  The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable
-	  or enable kmemcheck at boot-time. If the kernel is started with
-	  kmemcheck=0, the large memory and CPU overhead is not incurred.
-
-choice
-	prompt "kmemcheck: default mode at boot"
-	depends on KMEMCHECK
-	default KMEMCHECK_ONESHOT_BY_DEFAULT
-	help
-	  This option controls the default behaviour of kmemcheck when the
-	  kernel boots and no kmemcheck= parameter is given.
-
-config KMEMCHECK_DISABLED_BY_DEFAULT
-	bool "disabled"
-	depends on KMEMCHECK
-
-config KMEMCHECK_ENABLED_BY_DEFAULT
-	bool "enabled"
-	depends on KMEMCHECK
-
-config KMEMCHECK_ONESHOT_BY_DEFAULT
-	bool "one-shot"
-	depends on KMEMCHECK
-	help
-	  In one-shot mode, only the first error detected is reported before
-	  kmemcheck is disabled.
-
-endchoice
-
-config KMEMCHECK_QUEUE_SIZE
-	int "kmemcheck: error queue size"
-	depends on KMEMCHECK
-	default 64
-	help
-	  Select the maximum number of errors to store in the queue. Since
-	  errors can occur virtually anywhere and in any context, we need a
-	  temporary storage area which is guarantueed not to generate any
-	  other faults. The queue will be emptied as soon as a tasklet may
-	  be scheduled. If the queue is full, new error reports will be
-	  lost.
-
-config KMEMCHECK_SHADOW_COPY_SHIFT
-	int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)"
-	depends on KMEMCHECK
-	range 2 8
-	default 5
-	help
-	  Select the number of shadow bytes to save along with each entry of
-	  the queue. These bytes indicate what parts of an allocation are
-	  initialized, uninitialized, etc. and will be displayed when an
-	  error is detected to help the debugging of a particular problem.
-
-config KMEMCHECK_PARTIAL_OK
-	bool "kmemcheck: allow partially uninitialized memory"
-	depends on KMEMCHECK
-	default y
-	help
-	  This option works around certain GCC optimizations that produce
-	  32-bit reads from 16-bit variables where the upper 16 bits are
-	  thrown away afterwards. This may of course also hide some real
-	  bugs.
-
-config KMEMCHECK_BITOPS_OK
-	bool "kmemcheck: allow bit-field manipulation"
-	depends on KMEMCHECK
-	default n
-	help
-	  This option silences warnings that would be generated for bit-field
-	  accesses where not all the bits are initialized at the same time.
-	  This may also hide some real bugs.
-
-endif
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5b0adf1435de..e5e606ee5f71 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC
 	bool "Debug page memory allocations"
 	depends on DEBUG_KERNEL
 	depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
-	depends on !KMEMCHECK
 	select PAGE_EXTENSION
 	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	---help---
diff --git a/mm/Makefile b/mm/Makefile
index 4659b93cba43..e7ebd176fb93 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n
 KCOV_INSTRUMENT_page_alloc.o := n
 KCOV_INSTRUMENT_debug-pagealloc.o := n
 KCOV_INSTRUMENT_kmemleak.o := n
-KCOV_INSTRUMENT_kmemcheck.o := n
 KCOV_INSTRUMENT_memcontrol.o := n
 KCOV_INSTRUMENT_mmzone.o := n
 KCOV_INSTRUMENT_vmstat.o := n
@@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o
 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
-obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
 obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index b3a4d61d341c..cec594032515 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -1,126 +1 @@
 // SPDX-License-Identifier: GPL-2.0
-#include <linux/gfp.h>
-#include <linux/mm_types.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include "slab.h"
-#include <linux/kmemcheck.h>
-
-void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
-{
-	struct page *shadow;
-	int pages;
-	int i;
-
-	pages = 1 << order;
-
-	/*
-	 * With kmemcheck enabled, we need to allocate a memory area for the
-	 * shadow bits as well.
-	 */
-	shadow = alloc_pages_node(node, flags, order);
-	if (!shadow) {
-		if (printk_ratelimit())
-			pr_err("kmemcheck: failed to allocate shadow bitmap\n");
-		return;
-	}
-
-	for(i = 0; i < pages; ++i)
-		page[i].shadow = page_address(&shadow[i]);
-
-	/*
-	 * Mark it as non-present for the MMU so that our accesses to
-	 * this memory will trigger a page fault and let us analyze
-	 * the memory accesses.
-	 */
-	kmemcheck_hide_pages(page, pages);
-}
-
-void kmemcheck_free_shadow(struct page *page, int order)
-{
-	struct page *shadow;
-	int pages;
-	int i;
-
-	if (!kmemcheck_page_is_tracked(page))
-		return;
-
-	pages = 1 << order;
-
-	kmemcheck_show_pages(page, pages);
-
-	shadow = virt_to_page(page[0].shadow);
-
-	for(i = 0; i < pages; ++i)
-		page[i].shadow = NULL;
-
-	__free_pages(shadow, order);
-}
-
-void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
-			  size_t size)
-{
-	if (unlikely(!object)) /* Skip object if allocation failed */
-		return;
-
-	/*
-	 * Has already been memset(), which initializes the shadow for us
-	 * as well.
-	 */
-	if (gfpflags & __GFP_ZERO)
-		return;
-
-	/* No need to initialize the shadow of a non-tracked slab. */
-	if (s->flags & SLAB_NOTRACK)
-		return;
-
-	if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) {
-		/*
-		 * Allow notracked objects to be allocated from
-		 * tracked caches. Note however that these objects
-		 * will still get page faults on access, they just
-		 * won't ever be flagged as uninitialized. If page
-		 * faults are not acceptable, the slab cache itself
-		 * should be marked NOTRACK.
-		 */
-		kmemcheck_mark_initialized(object, size);
-	} else if (!s->ctor) {
-		/*
-		 * New objects should be marked uninitialized before
-		 * they're returned to the called.
-		 */
-		kmemcheck_mark_uninitialized(object, size);
-	}
-}
-
-void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
-{
-	/* TODO: RCU freeing is unsupported for now; hide false positives. */
-	if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
-		kmemcheck_mark_freed(object, size);
-}
-
-void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order,
-			       gfp_t gfpflags)
-{
-	int pages;
-
-	if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK))
-		return;
-
-	pages = 1 << order;
-
-	/*
-	 * NOTE: We choose to track GFP_ZERO pages too; in fact, they
-	 * can become uninitialized by copying uninitialized memory
-	 * into them.
-	 */
-
-	/* XXX: Can use zone->node for node? */
-	kmemcheck_alloc_shadow(page, order, gfpflags, -1);
-
-	if (gfpflags & __GFP_ZERO)
-		kmemcheck_mark_initialized_pages(page, pages);
-	else
-		kmemcheck_mark_uninitialized_pages(page, pages);
-}
diff --git a/mm/slub.c b/mm/slub.c
index c2c41e178acf..cfd56e5a35fb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1371,7 +1371,7 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x)
 	 * So in order to make the debug calls that expect irqs to be
 	 * disabled we need to disable interrupts temporarily.
 	 */
-#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+#ifdef CONFIG_LOCKDEP
 	{
 		unsigned long flags;
 
@@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
  * Compiler cannot detect this function can be removed if slab_free_hook()
  * evaluates to nothing.  Thus, catch all relevant config debug options here.
  */
-#if defined(CONFIG_KMEMCHECK) ||		\
-	defined(CONFIG_LOCKDEP)	||		\
+#if defined(CONFIG_LOCKDEP)	||		\
 	defined(CONFIG_DEBUG_KMEMLEAK) ||	\
 	defined(CONFIG_DEBUG_OBJECTS_FREE) ||	\
 	defined(CONFIG_KASAN)
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 67d051edd615..7bd52b8f63d4 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -2182,8 +2182,6 @@ sub dump_struct($$) {
 	# strip comments:
 	$members =~ s/\/\*.*?\*\///gos;
 	$nested =~ s/\/\*.*?\*\///gos;
-	# strip kmemcheck_bitfield_{begin,end}.*;
-	$members =~ s/kmemcheck_bitfield_.*?;//gos;
 	# strip attributes
 	$members =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i;
 	$members =~ s/__aligned\s*\([^;]*\)//gos;
diff --git a/tools/include/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h
index 2bccd2c7b897..ea32a7d3cf1b 100644
--- a/tools/include/linux/kmemcheck.h
+++ b/tools/include/linux/kmemcheck.h
@@ -1,9 +1 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LIBLOCKDEP_LINUX_KMEMCHECK_H_
-#define _LIBLOCKDEP_LINUX_KMEMCHECK_H_
-
-static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
-}
-
-#endif
-- 
cgit v1.3-14-g43fede


From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Wed, 15 Nov 2017 17:38:03 -0800
Subject: mm: remove __GFP_COLD

As the page free path makes no distinction between cache hot and cold
pages, there is no real useful ordering of pages in the free list that
allocation requests can take advantage of.  Juding from the users of
__GFP_COLD, it is likely that a number of them are the result of copying
other sites instead of actually measuring the impact.  Remove the
__GFP_COLD parameter which simplifies a number of paths in the page
allocator.

This is potentially controversial but bear in mind that the size of the
per-cpu pagelists versus modern cache sizes means that the whole per-cpu
list can often fit in the L3 cache.  Hence, there is only a potential
benefit for microbenchmarks that alloc/free pages in a tight loop.  It's
even worse when THP is taken into account which has little or no chance
of getting a cache-hot page as the per-cpu list is bypassed and the
zeroing of multiple pages will thrash the cache anyway.

The truncate microbenchmarks are not shown as this patch affects the
allocation path and not the free path.  A page fault microbenchmark was
tested but it showed no sigificant difference which is not surprising
given that the __GFP_COLD branches are a miniscule percentage of the
fault path.

Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/ethernet/amazon/ena/ena_netdev.c         |  2 +-
 drivers/net/ethernet/amd/xgbe/xgbe-desc.c            |  2 +-
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c     |  3 +--
 .../net/ethernet/cavium/liquidio/octeon_network.h    |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c           |  5 ++---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c  |  4 ++--
 drivers/net/ethernet/qlogic/qlge/qlge_main.c         |  3 +--
 drivers/net/ethernet/sfc/falcon/rx.c                 |  2 +-
 drivers/net/ethernet/sfc/rx.c                        |  2 +-
 drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c      |  2 +-
 drivers/net/ethernet/ti/netcp_core.c                 |  2 +-
 drivers/net/virtio_net.c                             |  1 -
 drivers/staging/lustre/lustre/mdc/mdc_request.c      |  2 +-
 fs/cachefiles/rdwr.c                                 |  6 ++----
 include/linux/gfp.h                                  |  5 -----
 include/linux/pagemap.h                              |  8 +-------
 include/linux/skbuff.h                               |  2 +-
 include/linux/slab.h                                 |  3 ---
 include/trace/events/mmflags.h                       |  1 -
 kernel/power/snapshot.c                              |  4 ++--
 mm/filemap.c                                         |  6 +++---
 mm/page_alloc.c                                      | 20 ++++++--------------
 mm/percpu-vm.c                                       |  2 +-
 net/core/skbuff.c                                    |  4 ++--
 tools/perf/builtin-kmem.c                            |  1 -
 25 files changed, 32 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index c6bd5e24005d..fbbbd8b3eb45 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -517,7 +517,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num)
 
 
 		rc = ena_alloc_rx_page(rx_ring, rx_info,
-				       __GFP_COLD | GFP_ATOMIC | __GFP_COMP);
+				       GFP_ATOMIC | __GFP_COMP);
 		if (unlikely(rc < 0)) {
 			netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev,
 				   "failed to alloc buffer for rx queue %d\n",
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
index 45d92304068e..cc1e4f820e64 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c
@@ -295,7 +295,7 @@ again:
 	order = alloc_order;
 
 	/* Try to obtain pages, decreasing order if necessary */
-	gfp = GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_NOWARN;
+	gfp = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN;
 	while (order >= 0) {
 		pages = alloc_pages_node(node, gfp, order);
 		if (pages)
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
index 0654e0c76bc2..519ca6534b85 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c
@@ -304,8 +304,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self)
 		buff->flags = 0U;
 		buff->len = AQ_CFG_RX_FRAME_MAX;
 
-		buff->page = alloc_pages(GFP_ATOMIC | __GFP_COLD |
-					 __GFP_COMP, pages_order);
+		buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order);
 		if (!buff->page) {
 			err = -ENOMEM;
 			goto err_exit;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
index 9e36319cead6..57853eead4b5 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h
@@ -195,7 +195,7 @@ static inline void
 	struct sk_buff *skb;
 	struct octeon_skb_page_info *skb_pg_info;
 
-	page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+	page = alloc_page(GFP_ATOMIC);
 	if (unlikely(!page))
 		return NULL;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index b97a55c827eb..ffead38cf5da 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -193,7 +193,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
 
 			if (mlx4_en_prepare_rx_desc(priv, ring,
 						    ring->actual_size,
-						    GFP_KERNEL | __GFP_COLD)) {
+						    GFP_KERNEL)) {
 				if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
 					en_err(priv, "Failed to allocate enough rx buffers\n");
 					return -ENOMEM;
@@ -552,8 +552,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
 	do {
 		if (mlx4_en_prepare_rx_desc(priv, ring,
 					    ring->prod & ring->size_mask,
-					    GFP_ATOMIC | __GFP_COLD |
-					    __GFP_MEMALLOC))
+					    GFP_ATOMIC | __GFP_MEMALLOC))
 			break;
 		ring->prod++;
 	} while (likely(--missing));
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index e118b5f23996..ffb12dc13a5a 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1185,7 +1185,7 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
 	} else {
 		struct page *page;
 
-		page = alloc_page(GFP_KERNEL | __GFP_COLD);
+		page = alloc_page(GFP_KERNEL);
 		frag = page ? page_address(page) : NULL;
 	}
 	if (!frag) {
@@ -1212,7 +1212,7 @@ static void *nfp_net_napi_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr)
 	} else {
 		struct page *page;
 
-		page = alloc_page(GFP_ATOMIC | __GFP_COLD);
+		page = alloc_page(GFP_ATOMIC);
 		frag = page ? page_address(page) : NULL;
 	}
 	if (!frag) {
diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 29fea74bff2e..7b97a9969046 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -1092,8 +1092,7 @@ static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring,
 {
 	if (!rx_ring->pg_chunk.page) {
 		u64 map;
-		rx_ring->pg_chunk.page = alloc_pages(__GFP_COLD | __GFP_COMP |
-						GFP_ATOMIC,
+		rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC,
 						qdev->lbq_buf_order);
 		if (unlikely(!rx_ring->pg_chunk.page)) {
 			netif_err(qdev, drv, qdev->ndev,
diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c
index 6a8406dc0c2b..91097aea6c41 100644
--- a/drivers/net/ethernet/sfc/falcon/rx.c
+++ b/drivers/net/ethernet/sfc/falcon/rx.c
@@ -163,7 +163,7 @@ static int ef4_init_rx_buffers(struct ef4_rx_queue *rx_queue, bool atomic)
 	do {
 		page = ef4_reuse_page(rx_queue);
 		if (page == NULL) {
-			page = alloc_pages(__GFP_COLD | __GFP_COMP |
+			page = alloc_pages(__GFP_COMP |
 					   (atomic ? GFP_ATOMIC : GFP_KERNEL),
 					   efx->rx_buffer_order);
 			if (unlikely(page == NULL))
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 42443f434569..0004c50d3c83 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -163,7 +163,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic)
 	do {
 		page = efx_reuse_page(rx_queue);
 		if (page == NULL) {
-			page = alloc_pages(__GFP_COLD | __GFP_COMP |
+			page = alloc_pages(__GFP_COMP |
 					   (atomic ? GFP_ATOMIC : GFP_KERNEL),
 					   efx->rx_buffer_order);
 			if (unlikely(page == NULL))
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
index e9672b1f9968..031cf9c3435a 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c
@@ -335,7 +335,7 @@ static int xlgmac_alloc_pages(struct xlgmac_pdata *pdata,
 	dma_addr_t pages_dma;
 
 	/* Try to obtain pages, decreasing order if necessary */
-	gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN;
+	gfp |= __GFP_COMP | __GFP_NOWARN;
 	while (order >= 0) {
 		pages = alloc_pages(gfp, order);
 		if (pages)
diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c
index 437d36289786..50d2b76771b5 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -906,7 +906,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq)
 		sw_data[0] = (u32)bufptr;
 	} else {
 		/* Allocate a secondary receive queue entry */
-		page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD);
+		page = alloc_page(GFP_ATOMIC | GFP_DMA);
 		if (unlikely(!page)) {
 			dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n");
 			goto fail;
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 511f8339fa96..5eec09d63fc0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -988,7 +988,6 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq,
 	int err;
 	bool oom;
 
-	gfp |= __GFP_COLD;
 	do {
 		if (vi->mergeable_rx_bufs)
 			err = add_recvbuf_mergeable(vi, rq, gfp);
diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c
index 9e538a59f09d..03e55bca4ada 100644
--- a/drivers/staging/lustre/lustre/mdc/mdc_request.c
+++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c
@@ -1152,7 +1152,7 @@ static int mdc_read_page_remote(void *data, struct page *page0)
 	}
 
 	for (npages = 1; npages < max_pages; npages++) {
-		page = page_cache_alloc_cold(inode->i_mapping);
+		page = page_cache_alloc(inode->i_mapping);
 		if (!page)
 			break;
 		page_pool[npages] = page;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 23097cca2674..883bc7bb12c5 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 			goto backing_page_already_present;
 
 		if (!newpage) {
-			newpage = __page_cache_alloc(cachefiles_gfp |
-						     __GFP_COLD);
+			newpage = __page_cache_alloc(cachefiles_gfp);
 			if (!newpage)
 				goto nomem_monitor;
 		}
@@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 				goto backing_page_already_present;
 
 			if (!newpage) {
-				newpage = __page_cache_alloc(cachefiles_gfp |
-							     __GFP_COLD);
+				newpage = __page_cache_alloc(cachefiles_gfp);
 				if (!newpage)
 					goto nomem;
 			}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f7e62d9096fe..1a4582b44d32 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -24,7 +24,6 @@ struct vm_area_struct;
 #define ___GFP_HIGH		0x20u
 #define ___GFP_IO		0x40u
 #define ___GFP_FS		0x80u
-#define ___GFP_COLD		0x100u
 #define ___GFP_NOWARN		0x200u
 #define ___GFP_RETRY_MAYFAIL	0x400u
 #define ___GFP_NOFAIL		0x800u
@@ -192,16 +191,12 @@ struct vm_area_struct;
 /*
  * Action modifiers
  *
- * __GFP_COLD indicates that the caller does not expect to be used in the near
- *   future. Where possible, a cache-cold page will be returned.
- *
  * __GFP_NOWARN suppresses allocation failure reports.
  *
  * __GFP_COMP address compound page metadata.
  *
  * __GFP_ZERO returns a zeroed page on success.
  */
-#define __GFP_COLD	((__force gfp_t)___GFP_COLD)
 #define __GFP_NOWARN	((__force gfp_t)___GFP_NOWARN)
 #define __GFP_COMP	((__force gfp_t)___GFP_COMP)
 #define __GFP_ZERO	((__force gfp_t)___GFP_ZERO)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4c6790bb7afb..34ce3ebf97d5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -234,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x)
 	return __page_cache_alloc(mapping_gfp_mask(x));
 }
 
-static inline struct page *page_cache_alloc_cold(struct address_space *x)
-{
-	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
-}
-
 static inline gfp_t readahead_gfp_mask(struct address_space *x)
 {
-	return mapping_gfp_mask(x) |
-				  __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
+	return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
 }
 
 typedef int filler_t(void *, struct page *);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index aa1341474916..7c46fd0b8b64 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2672,7 +2672,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask,
 	 * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
 	 *     code in gfp_to_alloc_flags that should be enforcing this.
 	 */
-	gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC;
+	gfp_mask |= __GFP_COMP | __GFP_MEMALLOC;
 
 	return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
 }
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 79f6532f8a0b..50697a1d6621 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -467,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
  * Also it is possible to set different flags by OR'ing
  * in one or more of the following additional @flags:
  *
- * %__GFP_COLD - Request cache-cold pages instead of
- *   trying to return cache-warm pages.
- *
  * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
  *
  * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 72162f3a03fa..dbe1bb058c09 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -32,7 +32,6 @@
 	{(unsigned long)__GFP_ATOMIC,		"__GFP_ATOMIC"},	\
 	{(unsigned long)__GFP_IO,		"__GFP_IO"},		\
 	{(unsigned long)__GFP_FS,		"__GFP_FS"},		\
-	{(unsigned long)__GFP_COLD,		"__GFP_COLD"},		\
 	{(unsigned long)__GFP_NOWARN,		"__GFP_NOWARN"},	\
 	{(unsigned long)__GFP_RETRY_MAYFAIL,	"__GFP_RETRY_MAYFAIL"},	\
 	{(unsigned long)__GFP_NOFAIL,		"__GFP_NOFAIL"},	\
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index a917a301e201..bce0464524d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
  */
 static inline int get_highmem_buffer(int safe_needed)
 {
-	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+	buffer = get_image_page(GFP_ATOMIC, safe_needed);
 	return buffer ? 0 : -ENOMEM;
 }
 
@@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm,
 		while (nr_pages-- > 0) {
 			struct page *page;
 
-			page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+			page = alloc_image_page(GFP_ATOMIC);
 			if (!page)
 				goto err_out;
 			memory_bm_set_bit(copy_bm, page_to_pfn(page));
diff --git a/mm/filemap.c b/mm/filemap.c
index 90a9f261f85f..923fc2ebd74a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2272,7 +2272,7 @@ no_cached_page:
 		 * Ok, it wasn't cached, so we need to create a new
 		 * page..
 		 */
-		page = page_cache_alloc_cold(mapping);
+		page = page_cache_alloc(mapping);
 		if (!page) {
 			error = -ENOMEM;
 			goto out;
@@ -2384,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
 	int ret;
 
 	do {
-		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
+		page = __page_cache_alloc(gfp_mask);
 		if (!page)
 			return -ENOMEM;
 
@@ -2788,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping,
 repeat:
 	page = find_get_page(mapping, index);
 	if (!page) {
-		page = __page_cache_alloc(gfp | __GFP_COLD);
+		page = __page_cache_alloc(gfp);
 		if (!page)
 			return ERR_PTR(-ENOMEM);
 		err = add_to_page_cache_lru(page, mapping, index, gfp);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f265d37b3152..370b64d03e3f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2336,7 +2336,7 @@ retry:
  */
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list,
-			int migratetype, bool cold)
+			int migratetype)
 {
 	int i, alloced = 0;
 
@@ -2358,10 +2358,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		 * merge IO requests if the physical pages are ordered
 		 * properly.
 		 */
-		if (likely(!cold))
-			list_add(&page->lru, list);
-		else
-			list_add_tail(&page->lru, list);
+		list_add(&page->lru, list);
 		list = &page->lru;
 		alloced++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
@@ -2795,7 +2792,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 
 /* Remove page from the per-cpu list, caller must protect the list */
 static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
-			bool cold, struct per_cpu_pages *pcp,
+			struct per_cpu_pages *pcp,
 			struct list_head *list)
 {
 	struct page *page;
@@ -2804,16 +2801,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
 		if (list_empty(list)) {
 			pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, list,
-					migratetype, cold);
+					migratetype);
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
 
-		if (cold)
-			page = list_last_entry(list, struct page, lru);
-		else
-			page = list_first_entry(list, struct page, lru);
-
+		page = list_first_entry(list, struct page, lru);
 		list_del(&page->lru);
 		pcp->count--;
 	} while (check_new_pcp(page));
@@ -2828,14 +2821,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
 {
 	struct per_cpu_pages *pcp;
 	struct list_head *list;
-	bool cold = ((gfp_flags & __GFP_COLD) != 0);
 	struct page *page;
 	unsigned long flags;
 
 	local_irq_save(flags);
 	pcp = &this_cpu_ptr(zone->pageset)->pcp;
 	list = &pcp->lists[migratetype];
-	page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
+	page = __rmqueue_pcplist(zone,  migratetype, pcp, list);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
 		zone_statistics(preferred_zone, zone);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 15dab691ea70..9158e5a81391 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
 			    struct page **pages, int page_start, int page_end)
 {
-	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
 	unsigned int cpu, tcpu;
 	int i;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6cd057b41f34..9c68555bb906 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -353,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
  */
 void *netdev_alloc_frag(unsigned int fragsz)
 {
-	return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+	return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
 }
 EXPORT_SYMBOL(netdev_alloc_frag);
 
@@ -366,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 
 void *napi_alloc_frag(unsigned int fragsz)
 {
-	return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
+	return __napi_alloc_frag(fragsz, GFP_ATOMIC);
 }
 EXPORT_SYMBOL(napi_alloc_frag);
 
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index cbf70738ef5f..ae11e4c3516a 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -641,7 +641,6 @@ static const struct {
 	{ "__GFP_ATOMIC",		"_A" },
 	{ "__GFP_IO",			"I" },
 	{ "__GFP_FS",			"F" },
-	{ "__GFP_COLD",			"CO" },
 	{ "__GFP_NOWARN",		"NWR" },
 	{ "__GFP_RETRY_MAYFAIL",	"R" },
 	{ "__GFP_NOFAIL",		"NF" },
-- 
cgit v1.3-14-g43fede


From 4518085e127dff97e74f74a8780d7564e273bec8 Mon Sep 17 00:00:00 2001
From: Kemi Wang <kemi.wang@intel.com>
Date: Wed, 15 Nov 2017 17:38:22 -0800
Subject: mm, sysctl: make NUMA stats configurable

This is the second step which introduces a tunable interface that allow
numa stats configurable for optimizing zone_statistics(), as suggested
by Dave Hansen and Ying Huang.

=========================================================================

When page allocation performance becomes a bottleneck and you can
tolerate some possible tool breakage and decreased numa counter
precision, you can do:

	echo 0 > /proc/sys/vm/numa_stat

In this case, numa counter update is ignored.  We can see about
*4.8%*(185->176) drop of cpu cycles per single page allocation and
reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315)
drop of cpu cycles per single page allocation and reclaim on Jesper's
page_bench03 (88 threads) running on a 2-Socket Broadwell-based server
(88 threads, 126G memory).

Benchmark link provided by Jesper D Brouer (increase loop times to
10000000):

  https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

=========================================================================

When page allocation performance is not a bottleneck and you want all
tooling to work, you can do:

	echo 1 > /proc/sys/vm/numa_stat

This is system default setting.

Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka
for comments to help improve the original patch.

[keescook@chromium.org: make sure mutex is a global static]
  Link: http://lkml.kernel.org/r/20171107213809.GA4314@beast
Link: http://lkml.kernel.org/r/1508290927-8518-1-git-send-email-kemi.wang@intel.com
Signed-off-by: Kemi Wang <kemi.wang@intel.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Suggested-by: Dave Hansen <dave.hansen@intel.com>
Suggested-by: Ying Huang <ying.huang@intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: "Luis R . Rodriguez" <mcgrof@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt | 16 ++++++++++
 include/linux/vmstat.h      | 10 +++++++
 kernel/sysctl.c             |  9 ++++++
 mm/mempolicy.c              |  3 ++
 mm/page_alloc.c             |  6 ++++
 mm/vmstat.c                 | 71 +++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 115 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 3e579740b49f..055c8b3e1018 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm:
 - percpu_pagelist_fraction
 - stat_interval
 - stat_refresh
+- numa_stat
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -799,6 +800,21 @@ with no ill effects: errors and warnings on these stats are suppressed.)
 
 ==============================================================
 
+numa_stat
+
+This interface allows runtime configuration of numa statistics.
+
+When page allocation performance becomes a bottleneck and you can tolerate
+some possible tool breakage and decreased numa counter precision, you can
+do:
+	echo 0 > /proc/sys/vm/numa_stat
+
+When page allocation performance is not a bottleneck and you want all
+tooling to work, you can do:
+	echo 1 > /proc/sys/vm/numa_stat
+
+==============================================================
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1e0cb72e0598..1779c9817b39 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -7,9 +7,19 @@
 #include <linux/mmzone.h>
 #include <linux/vm_event_item.h>
 #include <linux/atomic.h>
+#include <linux/static_key.h>
 
 extern int sysctl_stat_interval;
 
+#ifdef CONFIG_NUMA
+#define ENABLE_NUMA_STAT   1
+#define DISABLE_NUMA_STAT   0
+extern int sysctl_vm_numa_stat;
+DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
+extern int sysctl_vm_numa_stat_handler(struct ctl_table *table,
+		int write, void __user *buffer, size_t *length, loff_t *ppos);
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7638e2f7fff8..4a13a389e99b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1356,6 +1356,15 @@ static struct ctl_table vm_table[] = {
 		.mode           = 0644,
 		.proc_handler   = &hugetlb_mempolicy_sysctl_handler,
 	},
+	{
+		.procname		= "numa_stat",
+		.data			= &sysctl_vm_numa_stat,
+		.maxlen			= sizeof(int),
+		.mode			= 0644,
+		.proc_handler	= sysctl_vm_numa_stat_handler,
+		.extra1			= &zero,
+		.extra2			= &one,
+	},
 #endif
 	 {
 		.procname	= "hugetlb_shm_group",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index dad166b736ba..4ce44d3ff03d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1915,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
 	struct page *page;
 
 	page = __alloc_pages(gfp, order, nid);
+	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
+	if (!static_branch_likely(&vm_numa_stat_key))
+		return page;
 	if (page && page_to_nid(page) == nid) {
 		preempt_disable();
 		__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7ca668e946e5..67f523c4711a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -82,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
 #endif
 
+DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
+
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 /*
  * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
@@ -2777,6 +2779,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 #ifdef CONFIG_NUMA
 	enum numa_stat_item local_stat = NUMA_LOCAL;
 
+	/* skip numa counters update if numa stats is disabled */
+	if (!static_branch_likely(&vm_numa_stat_key))
+		return;
+
 	if (z->node != numa_node_id())
 		local_stat = NUMA_OTHER;
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7d11554861e4..40b2db6db6b1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -32,6 +32,77 @@
 
 #define NUMA_STATS_THRESHOLD (U16_MAX - 2)
 
+#ifdef CONFIG_NUMA
+int sysctl_vm_numa_stat = ENABLE_NUMA_STAT;
+
+/* zero numa counters within a zone */
+static void zero_zone_numa_counters(struct zone *zone)
+{
+	int item, cpu;
+
+	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) {
+		atomic_long_set(&zone->vm_numa_stat[item], 0);
+		for_each_online_cpu(cpu)
+			per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]
+						= 0;
+	}
+}
+
+/* zero numa counters of all the populated zones */
+static void zero_zones_numa_counters(void)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone)
+		zero_zone_numa_counters(zone);
+}
+
+/* zero global numa counters */
+static void zero_global_numa_counters(void)
+{
+	int item;
+
+	for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++)
+		atomic_long_set(&vm_numa_stat[item], 0);
+}
+
+static void invalid_numa_statistics(void)
+{
+	zero_zones_numa_counters();
+	zero_global_numa_counters();
+}
+
+static DEFINE_MUTEX(vm_numa_stat_lock);
+
+int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret, oldval;
+
+	mutex_lock(&vm_numa_stat_lock);
+	if (write)
+		oldval = sysctl_vm_numa_stat;
+	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (ret || !write)
+		goto out;
+
+	if (oldval == sysctl_vm_numa_stat)
+		goto out;
+	else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) {
+		static_branch_enable(&vm_numa_stat_key);
+		pr_info("enable numa statistics\n");
+	} else {
+		static_branch_disable(&vm_numa_stat_key);
+		invalid_numa_statistics();
+		pr_info("disable numa statistics, and clear numa counters\n");
+	}
+
+out:
+	mutex_unlock(&vm_numa_stat_lock);
+	return ret;
+}
+#endif
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
-- 
cgit v1.3-14-g43fede


From b1fca27d384e8418aac84b39f6f5179aecc1b64f Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 17 Nov 2017 15:27:03 -0800
Subject: kernel debug: support resetting WARN*_ONCE

I like _ONCE warnings because it's guaranteed that they don't flood the
log.

During testing I find it useful to reset the state of the once warnings,
so that I can rerun tests and see if they trigger again, or can
guarantee that a test run always hits the same warnings.

This patch adds a debugfs interface to reset all the _ONCE warnings so
that they appear again:

  echo 1 > /sys/kernel/debug/clear_warn_once

This is implemented by putting all the warning booleans into a special
section, and clearing it.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20171017221455.6740-1-andi@firstfloor.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/clearing-warn-once.txt |  7 +++++++
 include/asm-generic/bug.h            |  6 +++---
 include/asm-generic/sections.h       |  1 +
 include/asm-generic/vmlinux.lds.h    |  3 +++
 kernel/panic.c                       | 28 ++++++++++++++++++++++++++++
 5 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/clearing-warn-once.txt

(limited to 'kernel')

diff --git a/Documentation/clearing-warn-once.txt b/Documentation/clearing-warn-once.txt
new file mode 100644
index 000000000000..5b1f5d547be1
--- /dev/null
+++ b/Documentation/clearing-warn-once.txt
@@ -0,0 +1,7 @@
+
+WARN_ONCE / WARN_ON_ONCE only print a warning once.
+
+echo 1 > /sys/kernel/debug/clear_warn_once
+
+clears the state and allows the warnings to print once again.
+This can be useful after test suite runs to reproduce problems.
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index af2cc94a61bf..7844b0df88cd 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -130,7 +130,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 
 #ifndef WARN_ON_ONCE
 #define WARN_ON_ONCE(condition)	({				\
-	static bool __section(.data.unlikely) __warned;		\
+	static bool __section(.data.once) __warned;		\
 	int __ret_warn_once = !!(condition);			\
 								\
 	if (unlikely(__ret_warn_once && !__warned)) {		\
@@ -142,7 +142,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 #endif
 
 #define WARN_ONCE(condition, format...)	({			\
-	static bool __section(.data.unlikely) __warned;		\
+	static bool __section(.data.once) __warned;		\
 	int __ret_warn_once = !!(condition);			\
 								\
 	if (unlikely(__ret_warn_once && !__warned)) {		\
@@ -153,7 +153,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 })
 
 #define WARN_TAINT_ONCE(condition, taint, format...)	({	\
-	static bool __section(.data.unlikely) __warned;		\
+	static bool __section(.data.once) __warned;		\
 	int __ret_warn_once = !!(condition);			\
 								\
 	if (unlikely(__ret_warn_once && !__warned)) {		\
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 6d9576931084..03cc5f9bba71 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -44,6 +44,7 @@ extern char __entry_text_start[], __entry_text_end[];
 extern char __start_rodata[], __end_rodata[];
 extern char __irqentry_text_start[], __irqentry_text_end[];
 extern char __softirqentry_text_start[], __softirqentry_text_end[];
+extern char __start_once[], __end_once[];
 
 /* Start and end of .ctors section - used for constructor calls. */
 extern char __ctors_start[], __ctors_end[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index bdcd1caae092..ee8b707d9fa9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -223,6 +223,9 @@
 	MEM_KEEP(init.data)						\
 	MEM_KEEP(exit.data)						\
 	*(.data.unlikely)						\
+	VMLINUX_SYMBOL(__start_once) = .;				\
+	*(.data.once)							\
+	VMLINUX_SYMBOL(__end_once) = .;					\
 	STRUCT_ALIGN();							\
 	*(__tracepoints)						\
 	/* implement dynamic printk debug */				\
diff --git a/kernel/panic.c b/kernel/panic.c
index bdd18afa19a4..672a91dc20fe 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,6 +27,8 @@
 #include <linux/console.h>
 #include <linux/bug.h>
 #include <linux/ratelimit.h>
+#include <linux/debugfs.h>
+#include <asm/sections.h>
 
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
@@ -587,6 +589,32 @@ void warn_slowpath_null(const char *file, int line)
 EXPORT_SYMBOL(warn_slowpath_null);
 #endif
 
+#ifdef CONFIG_BUG
+
+/* Support resetting WARN*_ONCE state */
+
+static int clear_warn_once_set(void *data, u64 val)
+{
+	memset(__start_once, 0, __end_once - __start_once);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
+			NULL,
+			clear_warn_once_set,
+			"%lld\n");
+
+static __init int register_warn_debugfs(void)
+{
+	/* Don't care about failure */
+	debugfs_create_file("clear_warn_once", 0644, NULL,
+			    NULL, &clear_warn_once_fops);
+	return 0;
+}
+
+device_initcall(register_warn_debugfs);
+#endif
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 
 /*
-- 
cgit v1.3-14-g43fede


From aaf5dcfb223617ac2d16113e4b500199c65689de Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Fri, 17 Nov 2017 15:27:06 -0800
Subject: kernel debug: support resetting WARN_ONCE for all architectures

Some architectures store the WARN_ONCE state in the flags field of the
bug_entry.  Clear that one too when resetting once state through
/sys/kernel/debug/clear_warn_once

Pointed out by Michael Ellerman

Improves the earlier patch that add clear_warn_once.

[ak@linux.intel.com: add a missing ifdef CONFIG_MODULES]
  Link: http://lkml.kernel.org/r/20171020170633.9593-1-andi@firstfloor.org
[akpm@linux-foundation.org: fix unused var warning]
[akpm@linux-foundation.org: Use 0200 for clear_warn_once file, per mpe]
[akpm@linux-foundation.org: clear BUGFLAG_DONE in clear_once_table(), per mpe]
Link: http://lkml.kernel.org/r/20171019204642.7404-1-andi@firstfloor.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bug.h |  5 +++++
 kernel/panic.c      |  3 ++-
 lib/bug.c           | 23 +++++++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bug.h b/include/linux/bug.h
index da4231c905c8..fe5916550da8 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -43,6 +43,8 @@ enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs);
 /* These are defined by the architecture */
 int is_valid_bugaddr(unsigned long addr);
 
+void generic_bug_clear_once(void);
+
 #else	/* !CONFIG_GENERIC_BUG */
 
 static inline enum bug_trap_type report_bug(unsigned long bug_addr,
@@ -51,6 +53,9 @@ static inline enum bug_trap_type report_bug(unsigned long bug_addr,
 	return BUG_TRAP_TYPE_BUG;
 }
 
+
+static inline void generic_bug_clear_once(void) {}
+
 #endif	/* CONFIG_GENERIC_BUG */
 
 /*
diff --git a/kernel/panic.c b/kernel/panic.c
index 672a91dc20fe..67cebf2a3b67 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -595,6 +595,7 @@ EXPORT_SYMBOL(warn_slowpath_null);
 
 static int clear_warn_once_set(void *data, u64 val)
 {
+	generic_bug_clear_once();
 	memset(__start_once, 0, __end_once - __start_once);
 	return 0;
 }
@@ -607,7 +608,7 @@ DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
 static __init int register_warn_debugfs(void)
 {
 	/* Don't care about failure */
-	debugfs_create_file("clear_warn_once", 0644, NULL,
+	debugfs_create_file("clear_warn_once", 0200, NULL,
 			    NULL, &clear_warn_once_fops);
 	return 0;
 }
diff --git a/lib/bug.c b/lib/bug.c
index 1e094408c893..f66be6bf6206 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -196,3 +196,26 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 
 	return BUG_TRAP_TYPE_BUG;
 }
+
+static void clear_once_table(struct bug_entry *start, struct bug_entry *end)
+{
+	struct bug_entry *bug;
+
+	for (bug = start; bug < end; bug++)
+		bug->flags &= ~BUGFLAG_DONE;
+}
+
+void generic_bug_clear_once(void)
+{
+#ifdef CONFIG_MODULES
+	struct module *mod;
+
+	rcu_read_lock_sched();
+	list_for_each_entry_rcu(mod, &module_bug_list, bug_list)
+		clear_once_table(mod->bug_table,
+				 mod->bug_table + mod->num_bugs);
+	rcu_read_unlock_sched();
+#endif
+
+	clear_once_table(__start___bug_table, __stop___bug_table);
+}
-- 
cgit v1.3-14-g43fede


From 2a8358d8a339540f00ec596526690e8eeca931a3 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 17 Nov 2017 15:27:21 -0800
Subject: bug: define the "cut here" string in a single place

The "cut here" string is used in a few paths.  Define it in a single
place.

Link: http://lkml.kernel.org/r/1510100869-73751-3-git-send-email-keescook@chromium.org
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mn10300/mm/fault.c   | 2 +-
 include/asm-generic/bug.h | 2 ++
 kernel/panic.c            | 2 +-
 lib/bug.c                 | 2 +-
 4 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index f23781d6bbb3..f0bfa1448744 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -60,7 +60,7 @@ void bust_spinlocks(int yes)
 void do_BUG(const char *file, int line)
 {
 	bust_spinlocks(1);
-	printk(KERN_EMERG "------------[ cut here ]------------\n");
+	printk(KERN_EMERG CUT_HERE);
 	printk(KERN_EMERG "kernel BUG at %s:%d!\n", file, line);
 }
 
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 7844b0df88cd..1283473f234e 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -4,6 +4,8 @@
 
 #include <linux/compiler.h>
 
+#define CUT_HERE		"------------[ cut here ]------------\n"
+
 #ifdef CONFIG_GENERIC_BUG
 #define BUGFLAG_WARNING		(1 << 0)
 #define BUGFLAG_ONCE		(1 << 1)
diff --git a/kernel/panic.c b/kernel/panic.c
index 67cebf2a3b67..89df5fa2f798 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -520,7 +520,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 {
 	disable_trace_on_warning();
 
-	pr_warn("------------[ cut here ]------------\n");
+	pr_warn(CUT_HERE);
 
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
diff --git a/lib/bug.c b/lib/bug.c
index f66be6bf6206..c1b0fad31b10 100644
--- a/lib/bug.c
+++ b/lib/bug.c
@@ -186,7 +186,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs)
 		return BUG_TRAP_TYPE_WARN;
 	}
 
-	printk(KERN_DEFAULT "------------[ cut here ]------------\n");
+	printk(KERN_DEFAULT CUT_HERE);
 
 	if (file)
 		pr_crit("kernel BUG at %s:%u!\n", file, line);
-- 
cgit v1.3-14-g43fede


From a7bed27af194aa3f67915688039d93188ed95e2a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 17 Nov 2017 15:27:24 -0800
Subject: bug: fix "cut here" location for __WARN_TAINT architectures

Prior to v4.11, x86 used warn_slowpath_fmt() for handling WARN()s.
After WARN() was moved to using UD0 on x86, the warning text started
appearing _before_ the "cut here" line.  This appears to have been a
long-standing bug on architectures that used __WARN_TAINT, but it didn't
get fixed.

v4.11 and earlier on x86:

  ------------[ cut here ]------------
  WARNING: CPU: 0 PID: 2956 at drivers/misc/lkdtm_bugs.c:65 lkdtm_WARNING+0x21/0x30
  This is a warning message
  Modules linked in:

v4.12 and later on x86:

  This is a warning message
  ------------[ cut here ]------------
  WARNING: CPU: 1 PID: 2982 at drivers/misc/lkdtm_bugs.c:68 lkdtm_WARNING+0x15/0x20
  Modules linked in:

With this fix:

  ------------[ cut here ]------------
  This is a warning message
  WARNING: CPU: 3 PID: 3009 at drivers/misc/lkdtm_bugs.c:67 lkdtm_WARNING+0x15/0x20

Since the __FILE__ reporting happens as part of the UD0 handler, it
isn't trivial to move the message to after the WARNING line, but at
least we can fix the position of the "cut here" line so all the various
logging tools will start including the actual runtime warning message
again, when they follow the instruction and "cut here".

Link: http://lkml.kernel.org/r/1510100869-73751-4-git-send-email-keescook@chromium.org
Fixes: 9a93848fe787 ("x86/debug: Implement __WARN() using UD0")
Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/bug.h |  5 +++--
 kernel/panic.c            | 16 +++++++++++++++-
 2 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index 1283473f234e..963b755d19b0 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -92,10 +92,11 @@ extern void warn_slowpath_null(const char *file, const int line);
 #define __WARN_printf_taint(taint, arg...)				\
 	warn_slowpath_fmt_taint(__FILE__, __LINE__, taint, arg)
 #else
+extern __printf(1, 2) void __warn_printk(const char *fmt, ...);
 #define __WARN()		__WARN_TAINT(TAINT_WARN)
-#define __WARN_printf(arg...)	do { printk(arg); __WARN(); } while (0)
+#define __WARN_printf(arg...)	do { __warn_printk(arg); __WARN(); } while (0)
 #define __WARN_printf_taint(taint, arg...)				\
-	do { printk(arg); __WARN_TAINT(taint); } while (0)
+	do { __warn_printk(arg); __WARN_TAINT(taint); } while (0)
 #endif
 
 /* used internally by panic.c */
diff --git a/kernel/panic.c b/kernel/panic.c
index 89df5fa2f798..3242b64b1956 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -520,7 +520,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 {
 	disable_trace_on_warning();
 
-	pr_warn(CUT_HERE);
+	if (args)
+		pr_warn(CUT_HERE);
 
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
@@ -584,9 +585,22 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint);
 
 void warn_slowpath_null(const char *file, int line)
 {
+	pr_warn(CUT_HERE);
 	__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
 }
 EXPORT_SYMBOL(warn_slowpath_null);
+#else
+void __warn_printk(const char *fmt, ...)
+{
+	va_list args;
+
+	pr_warn(CUT_HERE);
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL(__warn_printk);
 #endif
 
 #ifdef CONFIG_BUG
-- 
cgit v1.3-14-g43fede


From 8c703d660450c4df72ff24f79a335dc7973a9dc8 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Fri, 17 Nov 2017 15:27:32 -0800
Subject: kernel/umh.c: optimize 'proc_cap_handler()'

If 'write' is 0, we can avoid a call to spin_lock/spin_unlock.

Link: http://lkml.kernel.org/r/20171020193331.7233-1-christophe.jaillet@wanadoo.fr
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Luis R. Rodriguez <mcgrof@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/umh.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/umh.c b/kernel/umh.c
index 6ff9905250ff..18e5fa4b0e71 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	/*
 	 * Drop everything not in the new_cap (but don't add things)
 	 */
-	spin_lock(&umh_sysctl_lock);
 	if (write) {
+		spin_lock(&umh_sysctl_lock);
 		if (table->data == CAP_BSET)
 			usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
 		if (table->data == CAP_PI)
 			usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+		spin_unlock(&umh_sysctl_lock);
 	}
-	spin_unlock(&umh_sysctl_lock);
 
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From 98159d977f71c3b3dee898d1c34e56f520b094e7 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 17 Nov 2017 15:29:17 -0800
Subject: pipe: match pipe_max_size data type with procfs

Patch series "A few round_pipe_size() and pipe-max-size fixups", v3.

While backporting Michael's "pipe: fix limit handling" patchset to a
distro-kernel, Mikulas noticed that current upstream pipe limit handling
contains a few problems:

  1 - procfs signed wrap: echo'ing a large number into
      /proc/sys/fs/pipe-max-size and then cat'ing it back out shows a
      negative value.

  2 - round_pipe_size() nr_pages overflow on 32bit:  this would
      subsequently try roundup_pow_of_two(0), which is undefined.

  3 - visible non-rounded pipe-max-size value: there is no mutual
      exclusion or protection between the time pipe_max_size is assigned
      a raw value from proc_dointvec_minmax() and when it is rounded.

  4 - unsigned long -> unsigned int conversion makes for potential odd
      return errors from do_proc_douintvec_minmax_conv() and
      do_proc_dopipe_max_size_conv().

This version underwent the same testing as v1:
https://marc.info/?l=linux-kernel&m=150643571406022&w=2

This patch (of 4):

pipe_max_size is defined as an unsigned int:

  unsigned int pipe_max_size = 1048576;

but its procfs/sysctl representation is an integer:

  static struct ctl_table fs_table[] = {
          ...
          {
                  .procname       = "pipe-max-size",
                  .data           = &pipe_max_size,
                  .maxlen         = sizeof(int),
                  .mode           = 0644,
                  .proc_handler   = &pipe_proc_fn,
                  .extra1         = &pipe_min_size,
          },
          ...

that is signed:

  int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
                   size_t *lenp, loff_t *ppos)
  {
          ...
          ret = proc_dointvec_minmax(table, write, buf, lenp, ppos)

This leads to signed results via procfs for large values of pipe_max_size:

  % echo 2147483647 >/proc/sys/fs/pipe-max-size
  % cat /proc/sys/fs/pipe-max-size
  -2147483648

Use unsigned operations on this variable to avoid such negative values.

Link: http://lkml.kernel.org/r/1507658689-11669-2-git-send-email-joe.lawrence@redhat.com
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c       | 2 +-
 kernel/sysctl.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/fs/pipe.c b/fs/pipe.c
index 349c9d56d4b3..3909c55ed389 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1125,7 +1125,7 @@ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
 {
 	int ret;
 
-	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	ret = proc_douintvec_minmax(table, write, buf, lenp, ppos);
 	if (ret < 0 || !write)
 		return ret;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4a13a389e99b..2d42183b4c98 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1816,7 +1816,7 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "pipe-max-size",
 		.data		= &pipe_max_size,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(pipe_max_size),
 		.mode		= 0644,
 		.proc_handler	= &pipe_proc_fn,
 		.extra1		= &pipe_min_size,
-- 
cgit v1.3-14-g43fede


From 7a8d181949fb2c16be00f8cdb354794a30e46b39 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 17 Nov 2017 15:29:24 -0800
Subject: pipe: add proc_dopipe_max_size() to safely assign pipe_max_size

pipe_max_size is assigned directly via procfs sysctl:

  static struct ctl_table fs_table[] = {
          ...
          {
                  .procname       = "pipe-max-size",
                  .data           = &pipe_max_size,
                  .maxlen         = sizeof(int),
                  .mode           = 0644,
                  .proc_handler   = &pipe_proc_fn,
                  .extra1         = &pipe_min_size,
          },
          ...

  int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
                   size_t *lenp, loff_t *ppos)
  {
          ...
          ret = proc_dointvec_minmax(table, write, buf, lenp, ppos)
          ...

and then later rounded in-place a few statements later:

          ...
          pipe_max_size = round_pipe_size(pipe_max_size);
          ...

This leaves a window of time between initial assignment and rounding
that may be visible to other threads.  (For example, one thread sets a
non-rounded value to pipe_max_size while another reads its value.)

Similar reads of pipe_max_size are potentially racy:

  pipe.c :: alloc_pipe_info()
  pipe.c :: pipe_set_size()

Add a new proc_dopipe_max_size() that consolidates reading the new value
from the user buffer, verifying bounds, and calling round_pipe_size()
with a single assignment to pipe_max_size.

Link: http://lkml.kernel.org/r/1507658689-11669-4-git-send-email-joe.lawrence@redhat.com
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c                 | 18 +++--------------
 include/linux/pipe_fs_i.h |  1 +
 include/linux/sysctl.h    |  3 +++
 kernel/sysctl.c           | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 56 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/fs/pipe.c b/fs/pipe.c
index f0f4ab36c444..6d98566201ef 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1020,7 +1020,7 @@ const struct file_operations pipefifo_fops = {
  * Currently we rely on the pipe array holding a power-of-2 number
  * of pages. Returns 0 on error.
  */
-static inline unsigned int round_pipe_size(unsigned int size)
+unsigned int round_pipe_size(unsigned int size)
 {
 	unsigned long nr_pages;
 
@@ -1125,25 +1125,13 @@ out_revert_acct:
 }
 
 /*
- * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size
  * will return an error.
  */
 int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
 		 size_t *lenp, loff_t *ppos)
 {
-	unsigned int rounded_pipe_max_size;
-	int ret;
-
-	ret = proc_douintvec_minmax(table, write, buf, lenp, ppos);
-	if (ret < 0 || !write)
-		return ret;
-
-	rounded_pipe_max_size = round_pipe_size(pipe_max_size);
-	if (rounded_pipe_max_size == 0)
-		return -EINVAL;
-
-	pipe_max_size = rounded_pipe_max_size;
-	return ret;
+	return proc_dopipe_max_size(table, write, buf, lenp, ppos);
 }
 
 /*
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 6a80cfc63e0c..2dc5e9870fcd 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -191,5 +191,6 @@ long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
 struct pipe_inode_info *get_pipe_info(struct file *file);
 
 int create_pipe_files(struct file **, int);
+unsigned int round_pipe_size(unsigned int size);
 
 #endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b769ecfcc3bd..992bc9948232 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -51,6 +51,9 @@ extern int proc_dointvec_minmax(struct ctl_table *, int,
 extern int proc_douintvec_minmax(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp,
 				 loff_t *ppos);
+extern int proc_dopipe_max_size(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos);
 extern int proc_dointvec_jiffies(struct ctl_table *, int,
 				 void __user *, size_t *, loff_t *);
 extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d42183b4c98..138b6484f277 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -66,6 +66,7 @@
 #include <linux/kexec.h>
 #include <linux/bpf.h>
 #include <linux/mount.h>
+#include <linux/pipe_fs_i.h>
 
 #include <linux/uaccess.h>
 #include <asm/processor.h>
@@ -2620,6 +2621,47 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 				 do_proc_douintvec_minmax_conv, &param);
 }
 
+struct do_proc_dopipe_max_size_conv_param {
+	unsigned int *min;
+};
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+					unsigned int *valp,
+					int write, void *data)
+{
+	struct do_proc_dopipe_max_size_conv_param *param = data;
+
+	if (write) {
+		unsigned int val = round_pipe_size(*lvalp);
+
+		if (val == 0)
+			return -EINVAL;
+
+		if (param->min && *param->min > val)
+			return -ERANGE;
+
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+
+		*valp = val;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long) val;
+	}
+
+	return 0;
+}
+
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct do_proc_dopipe_max_size_conv_param param = {
+		.min = (unsigned int *) table->extra1,
+	};
+	return do_proc_douintvec(table, write, buffer, lenp, ppos,
+				 do_proc_dopipe_max_size_conv, &param);
+}
+
 static void validate_coredump_safety(void)
 {
 #ifdef CONFIG_COREDUMP
@@ -3125,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -3168,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
+EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
 EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
 EXPORT_SYMBOL(proc_dostring);
-- 
cgit v1.3-14-g43fede


From fb910c42ccebf853c29296185c45c11164a56098 Mon Sep 17 00:00:00 2001
From: Joe Lawrence <joe.lawrence@redhat.com>
Date: Fri, 17 Nov 2017 15:29:28 -0800
Subject: sysctl: check for UINT_MAX before unsigned int min/max

Mikulas noticed in the existing do_proc_douintvec_minmax_conv() and
do_proc_dopipe_max_size_conv() introduced in this patchset, that they
inconsistently handle overflow and min/max range inputs:

For example:

  0 ... param->min - 1 ---> ERANGE
  param->min ... param->max ---> the value is accepted
  param->max + 1 ... 0x100000000L + param->min - 1 ---> ERANGE
  0x100000000L + param->min ... 0x100000000L + param->max ---> EINVAL
  0x100000000L + param->max + 1, 0x200000000L + param->min - 1 ---> ERANGE
  0x200000000L + param->min ... 0x200000000L + param->max ---> EINVAL
  0x200000000L + param->max + 1, 0x300000000L + param->min - 1 ---> ERANGE

In do_proc_do*() routines which store values into unsigned int variables
(4 bytes wide for 64-bit builds), first validate that the input unsigned
long value (8 bytes wide for 64-bit builds) will fit inside the smaller
unsigned int variable.  Then check that the unsigned int value falls
inside the specified parameter min, max range.  Otherwise the unsigned
long -> unsigned int conversion drops leading bits from the input value,
leading to the inconsistent pattern Mikulas documented above.

Link: http://lkml.kernel.org/r/1507658689-11669-5-git-send-email-joe.lawrence@redhat.com
Signed-off-by: Joe Lawrence <joe.lawrence@redhat.com>
Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 138b6484f277..dd25d90896fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2576,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
 	if (write) {
 		unsigned int val = *lvalp;
 
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+
 		if ((param->min && *param->min > val) ||
 		    (param->max && *param->max < val))
 			return -ERANGE;
 
-		if (*lvalp > UINT_MAX)
-			return -EINVAL;
 		*valp = val;
 	} else {
 		unsigned int val = *valp;
@@ -2632,17 +2633,18 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
 	struct do_proc_dopipe_max_size_conv_param *param = data;
 
 	if (write) {
-		unsigned int val = round_pipe_size(*lvalp);
+		unsigned int val;
 
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+
+		val = round_pipe_size(*lvalp);
 		if (val == 0)
 			return -EINVAL;
 
 		if (param->min && *param->min > val)
 			return -ERANGE;
 
-		if (*lvalp > UINT_MAX)
-			return -EINVAL;
-
 		*valp = val;
 	} else {
 		unsigned int val = *valp;
-- 
cgit v1.3-14-g43fede


From 628c1bcba204052d19b686b5bac149a644cdb72e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 17 Nov 2017 15:30:01 -0800
Subject: kernel/signal.c: protect the traced SIGNAL_UNKILLABLE tasks from
 SIGKILL

The comment in sig_ignored() says "Tracers may want to know about even
ignored signals" but SIGKILL can not be reported to debugger and it is
just wrong to return 0 in this case: SIGKILL should only kill the
SIGNAL_UNKILLABLE task if it comes from the parent ns.

Change sig_ignored() to ignore ->ptrace if sig == SIGKILL and rely on
sig_task_ignored().

SISGTOP coming from within the namespace is not really right too but at
least debugger can intercept it, and we can't drop it here because this
will break "gdb -p 1": ptrace_attach() won't work.  Perhaps we will add
another ->ptrace check later, we will see.

Link: http://lkml.kernel.org/r/20171103184206.GB21036@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Tested-by: Kyle Huey <me@kylehuey.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index aa1fb9f905db..be5913134742 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	if (!sig_task_ignored(t, sig, force))
-		return 0;
-
 	/*
-	 * Tracers may want to know about even ignored signals.
+	 * Tracers may want to know about even ignored signal unless it
+	 * is SIGKILL which can't be reported anyway but can be ignored
+	 * by SIGNAL_UNKILLABLE task.
 	 */
-	return !t->ptrace;
+	if (t->ptrace && sig != SIGKILL)
+		return 0;
+
+	return sig_task_ignored(t, sig, force);
 }
 
 /*
-- 
cgit v1.3-14-g43fede


From ac25385089f673560867eb5179228a44ade0cfc1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 17 Nov 2017 15:30:04 -0800
Subject: kernel/signal.c: protect the SIGNAL_UNKILLABLE tasks from
 !sig_kernel_only() signals

Change sig_task_ignored() to drop the SIG_DFL && !sig_kernel_only()
signals even if force == T.  This simplifies the next change and this
matches the same check in get_signal() which will drop these signals
anyway.

Link: http://lkml.kernel.org/r/20171103184227.GC21036@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Tested-by: Kyle Huey <me@kylehuey.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index be5913134742..01ba166a5e3a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
 	handler = sig_handler(t, sig);
 
 	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
-			handler == SIG_DFL && !force)
+	    handler == SIG_DFL && !(force && sig_kernel_only(sig)))
 		return 1;
 
 	return sig_handler_ignored(handler, sig);
-- 
cgit v1.3-14-g43fede


From 426915796ccaf9c2bd9bb06dc5702225957bc2e5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 17 Nov 2017 15:30:08 -0800
Subject: kernel/signal.c: remove the no longer needed SIGNAL_UNKILLABLE check
 in complete_signal()

complete_signal() checks SIGNAL_UNKILLABLE before it starts to destroy
the thread group, today this is wrong in many ways.

If nothing else, fatal_signal_pending() should always imply that the
whole thread group (except ->group_exit_task if it is not NULL) is
killed, this check breaks the rule.

After the previous changes we can rely on sig_task_ignored();
sig_fatal(sig) && SIGNAL_UNKILLABLE can only be true if we actually want
to kill this task and sig == SIGKILL OR it is traced and debugger can
intercept the signal.

This should hopefully fix the problem reported by Dmitry.  This
test-case

	static int init(void *arg)
	{
		for (;;)
			pause();
	}

	int main(void)
	{
		char stack[16 * 1024];

		for (;;) {
			int pid = clone(init, stack + sizeof(stack)/2,
					CLONE_NEWPID | SIGCHLD, NULL);
			assert(pid > 0);

			assert(ptrace(PTRACE_ATTACH, pid, 0, 0) == 0);
			assert(waitpid(-1, NULL, WSTOPPED) == pid);

			assert(ptrace(PTRACE_DETACH, pid, 0, SIGSTOP) == 0);
			assert(syscall(__NR_tkill, pid, SIGKILL) == 0);
			assert(pid == wait(NULL));
		}
	}

triggers the WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING)) in
task_participate_group_stop().  do_signal_stop()->signal_group_exit()
checks SIGNAL_GROUP_EXIT and return false, but task_set_jobctl_pending()
checks fatal_signal_pending() and does not set JOBCTL_STOP_PENDING.

And his should fix the minor security problem reported by Kyle,
SECCOMP_RET_TRACE can miss fatal_signal_pending() the same way if the
task is the root of a pid namespace.

Link: http://lkml.kernel.org/r/20171103184246.GD21036@redhat.com
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reported-by: Kyle Huey <me@kylehuey.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Tested-by: Kyle Huey <me@kylehuey.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 01ba166a5e3a..6895f6bb98a7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -931,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	 * then start taking the whole group down immediately.
 	 */
 	if (sig_fatal(p, sig) &&
-	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+	    !(signal->flags & SIGNAL_GROUP_EXIT) &&
 	    !sigismember(&t->real_blocked, sig) &&
-	    (sig == SIGKILL || !t->ptrace)) {
+	    (sig == SIGKILL || !p->ptrace)) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
-- 
cgit v1.3-14-g43fede


From de40ccefd1f19180c0a43e4d9b9d2f4dc8856c8b Mon Sep 17 00:00:00 2001
From: Dave Young <dyoung@redhat.com>
Date: Fri, 17 Nov 2017 15:30:12 -0800
Subject: kdump: print a message in case parse_crashkernel_mem resulted in zero
 bytes

parse_crashkernel_mem() silently returns if we get zero bytes in the
parsing function.  It is useful for debugging to add a message,
especially if the kernel cannot boot correctly.

Add a pr_info instead of pr_warn because it is expected behavior for
size = 0, eg.  crashkernel=2G-4G:128M, size will be 0 in case system
memory is less than 2G.

Link: http://lkml.kernel.org/r/20171114080129.GA6115@dhcp-128-65.nay.redhat.com
Signed-off-by: Dave Young <dyoung@redhat.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Bhupesh Sharma <bhsharma@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/crash_core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 6db80fc0810b..b3663896278e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline,
 				return -EINVAL;
 			}
 		}
-	}
+	} else
+		pr_info("crashkernel size resulted in zero bytes\n");
 
 	return 0;
 }
-- 
cgit v1.3-14-g43fede


From f9eb2fdd04d4e68fbea18970bbf65ace716d25b6 Mon Sep 17 00:00:00 2001
From: "Ola N. Kaldestad" <mail@okal.no>
Date: Fri, 17 Nov 2017 15:30:26 -0800
Subject: kernel/sysctl.c: code cleanups

Remove unnecessary else block, remove redundant return and call to kfree
in if block.

Link: http://lkml.kernel.org/r/1510238435-1655-1-git-send-email-mail@okal.no
Signed-off-by: Ola N. Kaldestad <mail@okal.no>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index dd25d90896fc..557d46728577 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3127,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 			else
 				bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
 		}
-		kfree(tmp_bitmap);
 		*lenp -= left;
 		*ppos += *lenp;
-		return 0;
-	} else {
-		kfree(tmp_bitmap);
-		return err;
 	}
+
+	kfree(tmp_bitmap);
+	return err;
 }
 
 #else /* CONFIG_PROC_SYSCTL */
-- 
cgit v1.3-14-g43fede


From 95846ecf9dac5089aed4b144d912225f8ef86ae4 Mon Sep 17 00:00:00 2001
From: Gargi Sharma <gs051095@gmail.com>
Date: Fri, 17 Nov 2017 15:30:30 -0800
Subject: pid: replace pid bitmap implementation with IDR API

Patch series "Replacing PID bitmap implementation with IDR API", v4.

This series replaces kernel bitmap implementation of PID allocation with
IDR API.  These patches are written to simplify the kernel by replacing
custom code with calls to generic code.

The following are the stats for pid and pid_namespace object files
before and after the replacement.  There is a noteworthy change between
the IDR and bitmap implementation.

Before
   text       data        bss        dec        hex    filename
   8447       3894         64      12405       3075    kernel/pid.o
After
   text       data        bss        dec        hex    filename
   3397        304          0       3701        e75    kernel/pid.o

Before
   text       data        bss        dec        hex    filename
   5692       1842        192       7726       1e2e    kernel/pid_namespace.o
After
   text       data        bss        dec        hex    filename
   2854        216         16       3086        c0e    kernel/pid_namespace.o

The following are the stats for ps, pstree and calling readdir on /proc
for 10,000 processes.

ps:
        With IDR API    With bitmap
real    0m1.479s        0m2.319s
user    0m0.070s        0m0.060s
sys     0m0.289s        0m0.516s

pstree:
        With IDR API    With bitmap
real    0m1.024s        0m1.794s
user    0m0.348s        0m0.612s
sys     0m0.184s        0m0.264s

proc:
        With IDR API    With bitmap
real    0m0.059s        0m0.074s
user    0m0.000s        0m0.004s
sys     0m0.016s        0m0.016s

This patch (of 2):

Replace the current bitmap implementation for Process ID allocation.
Functions that are no longer required, for example, free_pidmap(),
alloc_pidmap(), etc.  are removed.  The rest of the functions are
modified to use the IDR API.  The change was made to make the PID
allocation less complex by replacing custom code with calls to generic
API.

[gs051095@gmail.com: v6]
  Link: http://lkml.kernel.org/r/1507760379-21662-2-git-send-email-gs051095@gmail.com
[avagin@openvz.org: restore the old behaviour of the ns_last_pid sysctl]
  Link: http://lkml.kernel.org/r/20171106183144.16368-1-avagin@openvz.org
Link: http://lkml.kernel.org/r/1507583624-22146-2-git-send-email-gs051095@gmail.com
Signed-off-by: Gargi Sharma <gs051095@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Julia Lawall <julia.lawall@lip6.fr>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/platforms/cell/spufs/sched.c |   2 +-
 fs/proc/loadavg.c                         |   2 +-
 include/linux/pid_namespace.h             |  14 +--
 init/main.c                               |   2 +-
 kernel/pid.c                              | 201 ++++++------------------------
 kernel/pid_namespace.c                    |  53 ++++----
 6 files changed, 65 insertions(+), 209 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 1fbb5da17dd2..e47761cdcb98 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1093,7 +1093,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
 		LOAD_INT(c), LOAD_FRAC(c),
 		count_active_contexts(),
 		atomic_read(&nr_spu_contexts),
-		task_active_pid_ns(current)->last_pid);
+		idr_get_cursor(&task_active_pid_ns(current)->idr));
 	return 0;
 }
 
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bc5c58c00ee..a000d7547479 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
 		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
 		nr_running(), nr_threads,
-		task_active_pid_ns(current)->last_pid);
+		idr_get_cursor(&task_active_pid_ns(current)->idr));
 	return 0;
 }
 
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index c78af6061644..92c6aa509d2e 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -10,15 +10,8 @@
 #include <linux/nsproxy.h>
 #include <linux/kref.h>
 #include <linux/ns_common.h>
+#include <linux/idr.h>
 
-struct pidmap {
-       atomic_t nr_free;
-       void *page;
-};
-
-#define BITS_PER_PAGE		(PAGE_SIZE * 8)
-#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE-1)
-#define PIDMAP_ENTRIES		((PID_MAX_LIMIT+BITS_PER_PAGE-1)/BITS_PER_PAGE)
 
 struct fs_pin;
 
@@ -30,9 +23,8 @@ enum { /* definitions for pid_namespace's hide_pid field */
 
 struct pid_namespace {
 	struct kref kref;
-	struct pidmap pidmap[PIDMAP_ENTRIES];
+	struct idr idr;
 	struct rcu_head rcu;
-	int last_pid;
 	unsigned int nr_hashed;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
@@ -106,6 +98,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 
 extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
 void pidhash_init(void);
-void pidmap_init(void);
+void pid_idr_init(void);
 
 #endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index 859a786f7c0a..d0cbcfc06124 100644
--- a/init/main.c
+++ b/init/main.c
@@ -669,7 +669,7 @@ asmlinkage __visible void __init start_kernel(void)
 	if (late_time_init)
 		late_time_init();
 	calibrate_delay();
-	pidmap_init();
+	pid_idr_init();
 	anon_vma_init();
 #ifdef CONFIG_X86
 	if (efi_enabled(EFI_RUNTIME_SERVICES))
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedbdf066..0ce59369632f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,6 +39,7 @@
 #include <linux/proc_ns.h>
 #include <linux/proc_fs.h>
 #include <linux/sched/task.h>
+#include <linux/idr.h>
 
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -53,14 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
 
-static inline int mk_pid(struct pid_namespace *pid_ns,
-		struct pidmap *map, int off)
-{
-	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
-#define find_next_offset(map, off)					\
-		find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -70,10 +63,7 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
  */
 struct pid_namespace init_pid_ns = {
 	.kref = KREF_INIT(2),
-	.pidmap = {
-		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
-	},
-	.last_pid = 0,
+	.idr = IDR_INIT,
 	.nr_hashed = PIDNS_HASH_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
@@ -101,138 +91,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-static void free_pidmap(struct upid *upid)
-{
-	int nr = upid->nr;
-	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
-	int offset = nr & BITS_PER_PAGE_MASK;
-
-	clear_bit(offset, map->page);
-	atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
-	/*
-	 * This is the same as saying
-	 *
-	 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
-	 * and that mapping orders 'a' and 'b' with respect to 'base'.
-	 */
-	return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value.  We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
-	int prev;
-	int last_write = base;
-	do {
-		prev = last_write;
-		last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
-	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
-	int i, offset, max_scan, pid, last = pid_ns->last_pid;
-	struct pidmap *map;
-
-	pid = last + 1;
-	if (pid >= pid_max)
-		pid = RESERVED_PIDS;
-	offset = pid & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
-	/*
-	 * If last_pid points into the middle of the map->page we
-	 * want to scan this bitmap block twice, the second time
-	 * we start with offset == 0 (or RESERVED_PIDS).
-	 */
-	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
-	for (i = 0; i <= max_scan; ++i) {
-		if (unlikely(!map->page)) {
-			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-			/*
-			 * Free the page if someone raced with us
-			 * installing it:
-			 */
-			spin_lock_irq(&pidmap_lock);
-			if (!map->page) {
-				map->page = page;
-				page = NULL;
-			}
-			spin_unlock_irq(&pidmap_lock);
-			kfree(page);
-			if (unlikely(!map->page))
-				return -ENOMEM;
-		}
-		if (likely(atomic_read(&map->nr_free))) {
-			for ( ; ; ) {
-				if (!test_and_set_bit(offset, map->page)) {
-					atomic_dec(&map->nr_free);
-					set_last_pid(pid_ns, last, pid);
-					return pid;
-				}
-				offset = find_next_offset(map, offset);
-				if (offset >= BITS_PER_PAGE)
-					break;
-				pid = mk_pid(pid_ns, map, offset);
-				if (pid >= pid_max)
-					break;
-			}
-		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
-			++map;
-			offset = 0;
-		} else {
-			map = &pid_ns->pidmap[0];
-			offset = RESERVED_PIDS;
-			if (unlikely(last == offset))
-				break;
-		}
-		pid = mk_pid(pid_ns, map, offset);
-	}
-	return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
-	int offset;
-	struct pidmap *map, *end;
-
-	if (last >= PID_MAX_LIMIT)
-		return -1;
-
-	offset = (last + 1) & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
-	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
-	for (; map < end; map++, offset = 0) {
-		if (unlikely(!map->page))
-			continue;
-		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
-		if (offset < BITS_PER_PAGE)
-			return mk_pid(pid_ns, map, offset);
-	}
-	return -1;
-}
-
 void put_pid(struct pid *pid)
 {
 	struct pid_namespace *ns;
@@ -266,7 +124,7 @@ void free_pid(struct pid *pid)
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
 		hlist_del_rcu(&upid->pid_chain);
-		switch(--ns->nr_hashed) {
+		switch (--ns->nr_hashed) {
 		case 2:
 		case 1:
 			/* When all that is left in the pid namespace
@@ -284,12 +142,11 @@ void free_pid(struct pid *pid)
 			schedule_work(&ns->proc_work);
 			break;
 		}
+
+		idr_remove(&ns->idr, upid->nr);
 	}
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
-	for (i = 0; i <= pid->level; i++)
-		free_pidmap(pid->numbers + i);
-
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
@@ -308,8 +165,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	tmp = ns;
 	pid->level = ns->level;
+
 	for (i = ns->level; i >= 0; i--) {
-		nr = alloc_pidmap(tmp);
+		int pid_min = 1;
+
+		idr_preload(GFP_KERNEL);
+		spin_lock_irq(&pidmap_lock);
+
+		/*
+		 * init really needs pid 1, but after reaching the maximum
+		 * wrap back to RESERVED_PIDS
+		 */
+		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+			pid_min = RESERVED_PIDS;
+
+		/*
+		 * Store a null pointer so find_pid_ns does not find
+		 * a partially initialized PID (see below).
+		 */
+		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+				      pid_max, GFP_ATOMIC);
+		spin_unlock_irq(&pidmap_lock);
+		idr_preload_end();
+
 		if (nr < 0) {
 			retval = nr;
 			goto out_free;
@@ -339,6 +217,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	for ( ; upid >= pid->numbers; --upid) {
 		hlist_add_head_rcu(&upid->pid_chain,
 				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
+		/* Make the PID visible to find_pid_ns. */
+		idr_replace(&upid->ns->idr, pid, upid->nr);
 		upid->ns->nr_hashed++;
 	}
 	spin_unlock_irq(&pidmap_lock);
@@ -350,8 +230,11 @@ out_unlock:
 	put_pid_ns(ns);
 
 out_free:
+	spin_lock_irq(&pidmap_lock);
 	while (++i <= ns->level)
-		free_pidmap(pid->numbers + i);
+		idr_remove(&ns->idr, (pid->numbers + i)->nr);
+
+	spin_unlock_irq(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);
 	return ERR_PTR(retval);
@@ -553,16 +436,7 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
  */
 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
-	struct pid *pid;
-
-	do {
-		pid = find_pid_ns(nr, ns);
-		if (pid)
-			break;
-		nr = next_pidmap(ns, nr);
-	} while (nr > 0);
-
-	return pid;
+	return idr_get_next(&ns->idr, &nr);
 }
 
 /*
@@ -578,7 +452,7 @@ void __init pidhash_init(void)
 					   0, 4096);
 }
 
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
@@ -590,10 +464,7 @@ void __init pidmap_init(void)
 				PIDS_PER_CPU_MIN * num_possible_cpus());
 	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 
-	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	/* Reserve PID 0. We never call free_pidmap(0) */
-	set_bit(0, init_pid_ns.pidmap[0].page);
-	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+	idr_init(&init_pid_ns.idr);
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314893bc..ca7c8a8823b1 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
 #include <linux/export.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
+#include <linux/idr.h>
 
 struct pid_cache {
 	int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
 	struct ucounts *ucounts;
-	int i;
 	int err;
 
 	err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	if (ns == NULL)
 		goto out_dec;
 
-	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!ns->pidmap[0].page)
-		goto out_free;
+	idr_init(&ns->idr);
 
 	ns->pid_cachep = create_pid_cachep(level + 1);
 	if (ns->pid_cachep == NULL)
-		goto out_free_map;
+		goto out_free_idr;
 
 	err = ns_alloc_inum(&ns->ns);
 	if (err)
-		goto out_free_map;
+		goto out_free_idr;
 	ns->ns.ops = &pidns_operations;
 
 	kref_init(&ns->kref);
@@ -138,17 +136,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->nr_hashed = PIDNS_HASH_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
-	set_bit(0, ns->pidmap[0].page);
-	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
-	for (i = 1; i < PIDMAP_ENTRIES; i++)
-		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
 	return ns;
 
-out_free_map:
-	kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+	idr_destroy(&ns->idr);
 	kmem_cache_free(pid_ns_cachep, ns);
 out_dec:
 	dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
 {
-	int i;
-
 	ns_free_inum(&ns->ns);
-	for (i = 0; i < PIDMAP_ENTRIES; i++)
-		kfree(ns->pidmap[i].page);
+
+	idr_destroy(&ns->idr);
 	call_rcu(&ns->rcu, delayed_free_pidns);
 }
 
@@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	int rc;
 	struct task_struct *task, *me = current;
 	int init_pids = thread_group_leader(me) ? 1 : 2;
+	struct pid *pid;
 
 	/* Don't allow any more processes into the pid namespace */
 	disable_pid_allocation(pid_ns);
@@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * 	  maintain a tasklist for each pid namespace.
 	 *
 	 */
+	rcu_read_lock();
 	read_lock(&tasklist_lock);
-	nr = next_pidmap(pid_ns, 1);
-	while (nr > 0) {
-		rcu_read_lock();
-
-		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+	nr = 2;
+	idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+		task = pid_task(pid, PIDTYPE_PID);
 		if (task && !__fatal_signal_pending(task))
 			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
-
-		rcu_read_unlock();
-
-		nr = next_pidmap(pid_ns, nr);
 	}
 	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	/*
 	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
@@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(current);
 	struct ctl_table tmp = *table;
+	int ret, next;
 
 	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
@@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 * it should synchronize its usage with external means.
 	 */
 
-	tmp.data = &pid_ns->last_pid;
-	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	next = idr_get_cursor(&pid_ns->idr) - 1;
+
+	tmp.data = &next;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (!ret && write)
+		idr_set_cursor(&pid_ns->idr, next + 1);
+
+	return ret;
 }
 
 extern int pid_max;
-- 
cgit v1.3-14-g43fede


From e8cfbc245e24887e3c30235f71e9e9405e0cfc39 Mon Sep 17 00:00:00 2001
From: Gargi Sharma <gs051095@gmail.com>
Date: Fri, 17 Nov 2017 15:30:34 -0800
Subject: pid: remove pidhash

pidhash is no longer required as all the information can be looked up
from idr tree.  nr_hashed represented the number of pids that had been
hashed.  Since, nr_hashed and PIDNS_HASH_ADDING are no longer relevant,
it has been renamed to pid_allocated and PIDNS_ADDING respectively.

[gs051095@gmail.com: v6]
  Link: http://lkml.kernel.org/r/1507760379-21662-3-git-send-email-gs051095@gmail.com
Link: http://lkml.kernel.org/r/1507583624-22146-3-git-send-email-gs051095@gmail.com
Signed-off-by: Gargi Sharma <gs051095@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Tested-by: Tony Luck <tony.luck@intel.com>	[ia64]
Cc: Julia Lawall <julia.lawall@lip6.fr>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Pavel Tatashin <pasha.tatashin@oracle.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/kernel/asm-offsets.c |  4 ++--
 include/linux/init_task.h      |  1 -
 include/linux/pid.h            |  2 --
 include/linux/pid_namespace.h  |  4 ++--
 init/main.c                    |  1 -
 kernel/fork.c                  |  2 +-
 kernel/pid.c                   | 48 +++++++++---------------------------------
 kernel/pid_namespace.c         |  6 +++---
 8 files changed, 18 insertions(+), 50 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index f7693f49c573..f4db2168d1b8 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -31,8 +31,8 @@ void foo(void)
 	DEFINE(SIGFRAME_SIZE, sizeof (struct sigframe));
 	DEFINE(UNW_FRAME_INFO_SIZE, sizeof (struct unw_frame_info));
 
-	BUILD_BUG_ON(sizeof(struct upid) != 32);
-	DEFINE(IA64_UPID_SHIFT, 5);
+	BUILD_BUG_ON(sizeof(struct upid) != 16);
+	DEFINE(IA64_UPID_SHIFT, 4);
 
 	BLANK();
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 8062e6cc607c..6a532629c983 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -105,7 +105,6 @@ extern struct group_info init_groups;
 	.numbers	= { {						\
 		.nr		= 0,					\
 		.ns		= &init_pid_ns,				\
-		.pid_chain	= { .next = NULL, .pprev = NULL },	\
 	}, }								\
 }
 
diff --git a/include/linux/pid.h b/include/linux/pid.h
index dfd684ce0787..7633d55d9a24 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -51,10 +51,8 @@ enum pid_type
  */
 
 struct upid {
-	/* Try to keep pid_chain in the same cacheline as nr for find_vpid */
 	int nr;
 	struct pid_namespace *ns;
-	struct hlist_node pid_chain;
 };
 
 struct pid
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 92c6aa509d2e..49538b172483 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -25,7 +25,7 @@ struct pid_namespace {
 	struct kref kref;
 	struct idr idr;
 	struct rcu_head rcu;
-	unsigned int nr_hashed;
+	unsigned int pid_allocated;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
 	unsigned int level;
@@ -49,7 +49,7 @@ struct pid_namespace {
 
 extern struct pid_namespace init_pid_ns;
 
-#define PIDNS_HASH_ADDING (1U << 31)
+#define PIDNS_ADDING (1U << 31)
 
 #ifdef CONFIG_PID_NS
 static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
diff --git a/init/main.c b/init/main.c
index d0cbcfc06124..dfec3809e740 100644
--- a/init/main.c
+++ b/init/main.c
@@ -562,7 +562,6 @@ asmlinkage __visible void __init start_kernel(void)
 	 * kmem_cache_init()
 	 */
 	setup_log_buf(0);
-	pidhash_init();
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
diff --git a/kernel/fork.c b/kernel/fork.c
index 4e55eedba8d6..432eadf6b58c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1871,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process(
 		retval = -ERESTARTNOINTR;
 		goto bad_fork_cancel_cgroup;
 	}
-	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
 		retval = -ENOMEM;
 		goto bad_fork_cancel_cgroup;
 	}
diff --git a/kernel/pid.c b/kernel/pid.c
index 0ce59369632f..b13b624e2c49 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -41,10 +41,6 @@
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 
-#define pid_hashfn(nr, ns)	\
-	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
 int pid_max = PID_MAX_DEFAULT;
@@ -54,7 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
 
-
 /*
  * PID-map pages start out as NULL, they get allocated upon
  * first use and are never deallocated. This way a low pid_max
@@ -64,7 +59,7 @@ int pid_max_max = PID_MAX_LIMIT;
 struct pid_namespace init_pid_ns = {
 	.kref = KREF_INIT(2),
 	.idr = IDR_INIT,
-	.nr_hashed = PIDNS_HASH_ADDING,
+	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
@@ -123,8 +118,7 @@ void free_pid(struct pid *pid)
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
-		hlist_del_rcu(&upid->pid_chain);
-		switch (--ns->nr_hashed) {
+		switch (--ns->pid_allocated) {
 		case 2:
 		case 1:
 			/* When all that is left in the pid namespace
@@ -133,10 +127,10 @@ void free_pid(struct pid *pid)
 			 */
 			wake_up_process(ns->child_reaper);
 			break;
-		case PIDNS_HASH_ADDING:
+		case PIDNS_ADDING:
 			/* Handle a fork failure of the first process */
 			WARN_ON(ns->child_reaper);
-			ns->nr_hashed = 0;
+			ns->pid_allocated = 0;
 			/* fall through */
 		case 0:
 			schedule_work(&ns->proc_work);
@@ -212,14 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
-	if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
 	for ( ; upid >= pid->numbers; --upid) {
-		hlist_add_head_rcu(&upid->pid_chain,
-				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
-		upid->ns->nr_hashed++;
+		upid->ns->pid_allocated++;
 	}
 	spin_unlock_irq(&pidmap_lock);
 
@@ -243,21 +235,13 @@ out_free:
 void disable_pid_allocation(struct pid_namespace *ns)
 {
 	spin_lock_irq(&pidmap_lock);
-	ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+	ns->pid_allocated &= ~PIDNS_ADDING;
 	spin_unlock_irq(&pidmap_lock);
 }
 
 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
 {
-	struct upid *pnr;
-
-	hlist_for_each_entry_rcu(pnr,
-			&pid_hash[pid_hashfn(nr, ns)], pid_chain)
-		if (pnr->nr == nr && pnr->ns == ns)
-			return container_of(pnr, struct pid,
-					numbers[ns->level]);
-
-	return NULL;
+	return idr_find(&ns->idr, nr);
 }
 EXPORT_SYMBOL_GPL(find_pid_ns);
 
@@ -413,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 		if (type != PIDTYPE_PID) {
 			if (type == __PIDTYPE_TGID)
 				type = PIDTYPE_PID;
+
 			task = task->group_leader;
 		}
 		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -439,23 +424,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 	return idr_get_next(&ns->idr, &nr);
 }
 
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
-	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
-					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
-					   &pidhash_shift, NULL,
-					   0, 4096);
-}
-
 void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
-	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
 	pid_max = min(pid_max_max, max_t(int, pid_max,
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index ca7c8a8823b1..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -133,7 +133,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
-	ns->nr_hashed = PIDNS_HASH_ADDING;
+	ns->pid_allocated = PIDNS_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
 	return ns;
@@ -254,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
 	 * really care, we could reparent them to the global init. We could
 	 * exit and reap ->child_reaper even if it is not the last thread in
-	 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
 	 * pid_ns can not go away until proc_kill_sb() drops the reference.
 	 *
 	 * But this ns can also have other tasks injected by setns()+fork().
@@ -268,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 */
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (pid_ns->nr_hashed == init_pids)
+		if (pid_ns->pid_allocated == init_pids)
 			break;
 		schedule();
 	}
-- 
cgit v1.3-14-g43fede


From 4efb442cc12eb66535b7c7ed06005fd7889c1d77 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Fri, 17 Nov 2017 15:30:38 -0800
Subject: kernel/panic.c: add TAINT_AUX

This is the gist of a patch which we've been forward-porting in our
kernels for a long time now and it probably would make a good sense to
have such TAINT_AUX flag upstream which can be used by each distro etc,
how they see fit.  This way, we won't need to forward-port a distro-only
version indefinitely.

Add an auxiliary taint flag to be used by distros and others.  This
obviates the need to forward-port whatever internal solutions people
have in favor of a single flag which they can map arbitrarily to a
definition of their pleasing.

The "X" mnemonic could also mean eXternal, which would be taint from a
distro or something else but not the upstream kernel.  We will use it to
mark modules for which we don't provide support.  I.e., a really
eXternal module.

Link: http://lkml.kernel.org/r/20170911134533.dp5mtyku5bongx4c@pd.tnic
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Kees Cook <keescook@chromium.org>
Cc: Jessica Yu <jeyu@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Michal Marek <mmarek@suse.cz>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Takashi Iwai <tiwai@suse.de>
Cc: Petr Mladek <pmladek@suse.com>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h | 3 ++-
 kernel/panic.c         | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 4b484ab9e163..ce51455e2adf 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -549,7 +549,8 @@ extern enum system_states {
 #define TAINT_UNSIGNED_MODULE		13
 #define TAINT_SOFTLOCKUP		14
 #define TAINT_LIVEPATCH			15
-#define TAINT_FLAGS_COUNT		16
+#define TAINT_AUX			16
+#define TAINT_FLAGS_COUNT		17
 
 struct taint_flag {
 	char c_true;	/* character printed when tainted */
diff --git a/kernel/panic.c b/kernel/panic.c
index 3242b64b1956..2cfef408fec9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -324,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
 	{ 'E', ' ', true },	/* TAINT_UNSIGNED_MODULE */
 	{ 'L', ' ', false },	/* TAINT_SOFTLOCKUP */
 	{ 'K', ' ', true },	/* TAINT_LIVEPATCH */
+	{ 'X', ' ', true },	/* TAINT_AUX */
 };
 
 /**
@@ -345,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
  *  'E' - Unsigned module has been loaded.
  *  'L' - A soft lockup has previously occurred.
  *  'K' - Kernel has been live patched.
+ *  'X' - Auxiliary taint, for distros' use.
  *
  *	The string is overwritten by the next call to print_tainted().
  */
-- 
cgit v1.3-14-g43fede


From fcf4edac049a8bca41658970292e2dfdbc9d5f62 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 17 Nov 2017 15:30:42 -0800
Subject: kcov: remove pointless current != NULL check

__sanitizer_cov_trace_pc() is a hot code, so it's worth to remove
pointless '!current' check.  Current is never NULL.

Link: http://lkml.kernel.org/r/20170929162221.32500-1-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kcov.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kcov.c b/kernel/kcov.c
index fc6af9e1308b..d9f9fa9cacc6 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -62,7 +62,7 @@ void notrace __sanitizer_cov_trace_pc(void)
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
 	 */
-	if (!t || !in_task())
+	if (!in_task())
 		return;
 	mode = READ_ONCE(t->kcov_mode);
 	if (mode == KCOV_MODE_TRACE) {
-- 
cgit v1.3-14-g43fede


From ded97d2c2b2c5f1dcced0bc57133f7753b037dfc Mon Sep 17 00:00:00 2001
From: Victor Chibotaru <tchibo@google.com>
Date: Fri, 17 Nov 2017 15:30:46 -0800
Subject: kcov: support comparison operands collection

Enables kcov to collect comparison operands from instrumented code.
This is done by using Clang's -fsanitize=trace-cmp instrumentation
(currently not available for GCC).

The comparison operands help a lot in fuzz testing.  E.g.  they are used
in Syzkaller to cover the interiors of conditional statements with way
less attempts and thus make previously unreachable code reachable.

To allow separate collection of coverage and comparison operands two
different work modes are implemented.  Mode selection is now done via a
KCOV_ENABLE ioctl call with corresponding argument value.

Link: http://lkml.kernel.org/r/20171011095459.70721-1-glider@google.com
Signed-off-by: Victor Chibotaru <tchibo@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Konovalov <andreyknvl@google.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Popov <alex.popov@linux.com>
Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: <syzkaller@googlegroups.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kcov.h      |  12 ++-
 include/uapi/linux/kcov.h |  24 ++++++
 kernel/kcov.c             | 214 ++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 211 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kcov.h b/include/linux/kcov.h
index f5d8ce4f4f86..3ecf6f5e3a5f 100644
--- a/include/linux/kcov.h
+++ b/include/linux/kcov.h
@@ -8,19 +8,23 @@ struct task_struct;
 
 #ifdef CONFIG_KCOV
 
-void kcov_task_init(struct task_struct *t);
-void kcov_task_exit(struct task_struct *t);
-
 enum kcov_mode {
 	/* Coverage collection is not enabled yet. */
 	KCOV_MODE_DISABLED = 0,
+	/* KCOV was initialized, but tracing mode hasn't been chosen yet. */
+	KCOV_MODE_INIT = 1,
 	/*
 	 * Tracing coverage collection mode.
 	 * Covered PCs are collected in a per-task buffer.
 	 */
-	KCOV_MODE_TRACE = 1,
+	KCOV_MODE_TRACE_PC = 2,
+	/* Collecting comparison operands mode. */
+	KCOV_MODE_TRACE_CMP = 3,
 };
 
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
 #else
 
 static inline void kcov_task_init(struct task_struct *t) {}
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
index 33eabbb8ada1..9529867717a8 100644
--- a/include/uapi/linux/kcov.h
+++ b/include/uapi/linux/kcov.h
@@ -8,4 +8,28 @@
 #define KCOV_ENABLE			_IO('c', 100)
 #define KCOV_DISABLE			_IO('c', 101)
 
+enum {
+	/*
+	 * Tracing coverage collection mode.
+	 * Covered PCs are collected in a per-task buffer.
+	 * In new KCOV version the mode is chosen by calling
+	 * ioctl(fd, KCOV_ENABLE, mode). In older versions the mode argument
+	 * was supposed to be 0 in such a call. So, for reasons of backward
+	 * compatibility, we have chosen the value KCOV_TRACE_PC to be 0.
+	 */
+	KCOV_TRACE_PC = 0,
+	/* Collecting comparison operands mode. */
+	KCOV_TRACE_CMP = 1,
+};
+
+/*
+ * The format for the types of collected comparisons.
+ *
+ * Bit 0 shows whether one of the arguments is a compile-time constant.
+ * Bits 1 & 2 contain log2 of the argument size, up to 8 bytes.
+ */
+#define KCOV_CMP_CONST          (1 << 0)
+#define KCOV_CMP_SIZE(n)        ((n) << 1)
+#define KCOV_CMP_MASK           KCOV_CMP_SIZE(3)
+
 #endif /* _LINUX_KCOV_IOCTLS_H */
diff --git a/kernel/kcov.c b/kernel/kcov.c
index d9f9fa9cacc6..15f33faf4013 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -22,13 +22,21 @@
 #include <linux/kcov.h>
 #include <asm/setup.h>
 
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
 /*
  * kcov descriptor (one per opened debugfs file).
  * State transitions of the descriptor:
  *  - initial state after open()
  *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
  *  - then, mmap() call (several calls are allowed but not useful)
- *  - then, repeated enable/disable for a task (only one task a time allowed)
+ *  - then, ioctl(KCOV_ENABLE, arg), where arg is
+ *	KCOV_TRACE_PC - to trace only the PCs
+ *	or
+ *	KCOV_TRACE_CMP - to trace only the comparison operands
+ *  - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
  */
 struct kcov {
 	/*
@@ -48,51 +56,176 @@ struct kcov {
 	struct task_struct	*t;
 };
 
-/*
- * Entry point from instrumented code.
- * This is called once per basic-block/edge.
- */
-void notrace __sanitizer_cov_trace_pc(void)
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
 {
-	struct task_struct *t;
 	enum kcov_mode mode;
 
-	t = current;
 	/*
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
 	 */
 	if (!in_task())
-		return;
+		return false;
 	mode = READ_ONCE(t->kcov_mode);
-	if (mode == KCOV_MODE_TRACE) {
-		unsigned long *area;
-		unsigned long pos;
-		unsigned long ip = _RET_IP_;
+	/*
+	 * There is some code that runs in interrupts but for which
+	 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+	 * READ_ONCE()/barrier() effectively provides load-acquire wrt
+	 * interrupts, there are paired barrier()/WRITE_ONCE() in
+	 * kcov_ioctl_locked().
+	 */
+	barrier();
+	return mode == needed_mode;
+}
 
+static unsigned long canonicalize_ip(unsigned long ip)
+{
 #ifdef CONFIG_RANDOMIZE_BASE
-		ip -= kaslr_offset();
+	ip -= kaslr_offset();
 #endif
+	return ip;
+}
 
-		/*
-		 * There is some code that runs in interrupts but for which
-		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
-		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
-		 * interrupts, there are paired barrier()/WRITE_ONCE() in
-		 * kcov_ioctl_locked().
-		 */
-		barrier();
-		area = t->kcov_area;
-		/* The first word is number of subsequent PCs. */
-		pos = READ_ONCE(area[0]) + 1;
-		if (likely(pos < t->kcov_size)) {
-			area[pos] = ip;
-			WRITE_ONCE(area[0], pos);
-		}
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+	struct task_struct *t;
+	unsigned long *area;
+	unsigned long ip = canonicalize_ip(_RET_IP_);
+	unsigned long pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+		return;
+
+	area = t->kcov_area;
+	/* The first 64-bit word is the number of subsequent PCs. */
+	pos = READ_ONCE(area[0]) + 1;
+	if (likely(pos < t->kcov_size)) {
+		area[pos] = ip;
+		WRITE_ONCE(area[0], pos);
 	}
 }
 EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
 
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+	struct task_struct *t;
+	u64 *area;
+	u64 count, start_index, end_pos, max_pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+		return;
+
+	ip = canonicalize_ip(ip);
+
+	/*
+	 * We write all comparison arguments and types as u64.
+	 * The buffer was allocated for t->kcov_size unsigned longs.
+	 */
+	area = (u64 *)t->kcov_area;
+	max_pos = t->kcov_size * sizeof(unsigned long);
+
+	count = READ_ONCE(area[0]);
+
+	/* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+	start_index = 1 + count * KCOV_WORDS_PER_CMP;
+	end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+	if (likely(end_pos <= max_pos)) {
+		area[start_index] = type;
+		area[start_index + 1] = arg1;
+		area[start_index + 2] = arg2;
+		area[start_index + 3] = ip;
+		WRITE_ONCE(area[0], count + 1);
+	}
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+	u64 i;
+	u64 count = cases[0];
+	u64 size = cases[1];
+	u64 type = KCOV_CMP_CONST;
+
+	switch (size) {
+	case 8:
+		type |= KCOV_CMP_SIZE(0);
+		break;
+	case 16:
+		type |= KCOV_CMP_SIZE(1);
+		break;
+	case 32:
+		type |= KCOV_CMP_SIZE(2);
+		break;
+	case 64:
+		type |= KCOV_CMP_SIZE(3);
+		break;
+	default:
+		return;
+	}
+	for (i = 0; i < count; i++)
+		write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
 static void kcov_get(struct kcov *kcov)
 {
 	atomic_inc(&kcov->refcount);
@@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
 	/* Just to not leave dangling references behind. */
 	kcov_task_init(t);
 	kcov->t = NULL;
+	kcov->mode = KCOV_MODE_INIT;
 	spin_unlock(&kcov->lock);
 	kcov_put(kcov);
 }
@@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 
 	spin_lock(&kcov->lock);
 	size = kcov->size * sizeof(unsigned long);
-	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+	if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
 	    vma->vm_end - vma->vm_start != size) {
 		res = -EINVAL;
 		goto exit;
@@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
 	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
 	if (!kcov)
 		return -ENOMEM;
+	kcov->mode = KCOV_MODE_DISABLED;
 	atomic_set(&kcov->refcount, 1);
 	spin_lock_init(&kcov->lock);
 	filep->private_data = kcov;
@@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
 			return -EINVAL;
 		kcov->size = size;
-		kcov->mode = KCOV_MODE_TRACE;
+		kcov->mode = KCOV_MODE_INIT;
 		return 0;
 	case KCOV_ENABLE:
 		/*
@@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 		 * at task exit or voluntary by KCOV_DISABLE. After that it can
 		 * be enabled for another task.
 		 */
-		unused = arg;
-		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
-		    kcov->area == NULL)
+		if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
 			return -EINVAL;
 		if (kcov->t != NULL)
 			return -EBUSY;
+		if (arg == KCOV_TRACE_PC)
+			kcov->mode = KCOV_MODE_TRACE_PC;
+		else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+			kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+		return -ENOTSUPP;
+#endif
+		else
+			return -EINVAL;
 		t = current;
 		/* Cache in task struct for performance. */
 		t->kcov_size = kcov->size;
 		t->kcov_area = kcov->area;
-		/* See comment in __sanitizer_cov_trace_pc(). */
+		/* See comment in check_kcov_mode(). */
 		barrier();
 		WRITE_ONCE(t->kcov_mode, kcov->mode);
 		t->kcov = kcov;
@@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 			return -EINVAL;
 		kcov_task_init(t);
 		kcov->t = NULL;
+		kcov->mode = KCOV_MODE_INIT;
 		kcov_put(kcov);
 		return 0;
 	default:
-- 
cgit v1.3-14-g43fede


From 2d8364bae4db144df75ba85e92d2b8619ba8eedc Mon Sep 17 00:00:00 2001
From: Andrey Smirnov <andrew.smirnov@gmail.com>
Date: Fri, 17 Nov 2017 15:30:57 -0800
Subject: kernel/reboot.c: add devm_register_reboot_notifier()

Add devm_* wrapper around register_reboot_notifier to simplify device
specific reboot notifier registration/unregistration.

[akpm@linux-foundation.org: move `struct device' forward decl to top-of-file]
Link: http://lkml.kernel.org/r/20170320171753.1705-1-andrew.smirnov@gmail.com
Signed-off-by: Andrey Smirnov <andrew.smirnov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/reboot.h |  4 ++++
 kernel/reboot.c        | 27 +++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index d03da0eb95ca..e63799a6e895 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -6,6 +6,8 @@
 #include <linux/notifier.h>
 #include <uapi/linux/reboot.h>
 
+struct device;
+
 #define SYS_DOWN	0x0001	/* Notify of system down */
 #define SYS_RESTART	SYS_DOWN
 #define SYS_HALT	0x0002	/* Notify of system halt */
@@ -39,6 +41,8 @@ extern int reboot_force;
 extern int register_reboot_notifier(struct notifier_block *);
 extern int unregister_reboot_notifier(struct notifier_block *);
 
+extern int devm_register_reboot_notifier(struct device *, struct notifier_block *);
+
 extern int register_restart_handler(struct notifier_block *);
 extern int unregister_restart_handler(struct notifier_block *);
 extern void do_kernel_restart(char *cmd);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..e4ced883d8de 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+	WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct notifier_block **rcnb;
+	int ret;
+
+	rcnb = devres_alloc(devm_unregister_reboot_notifier,
+			    sizeof(*rcnb), GFP_KERNEL);
+	if (!rcnb)
+		return -ENOMEM;
+
+	ret = register_reboot_notifier(nb);
+	if (!ret) {
+		*rcnb = nb;
+		devres_add(dev, rcnb);
+	} else {
+		devres_free(rcnb);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
 /*
  *	Notifier list for kernel code which wants to be called
  *	to restart the system.
-- 
cgit v1.3-14-g43fede


From 13a9c48a85ccf1417b527975c0a12b47fbfaf625 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:51 -0800
Subject: bpf: offload: add comment warning developers about double destroy

Offload state may get destroyed either because the device for which
it was constructed is going away, or because the refcount of bpf
program itself has reached 0.  In both of those cases we will call
__bpf_prog_offload_destroy() to unlink the offload from the device.
We may in fact call it twice, which works just fine, but we should
make clear this is intended and caution others trying to extend the
function.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/offload.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 2816feb38be1..fd696d3dd429 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -85,6 +85,10 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
 	struct bpf_dev_offload *offload = prog->aux->offload;
 	struct netdev_bpf data = {};
 
+	/* Caution - if netdev is destroyed before the program, this function
+	 * will be called twice.
+	 */
+
 	data.offload.prog = prog;
 
 	if (offload->verifier_running)
-- 
cgit v1.3-14-g43fede


From 649f11dcd19a5f0d00fdbc760fbdccdd98e56a43 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:52 -0800
Subject: bpf: offload: limit offload to cls_bpf and xdp programs only

We are currently only allowing attachment of device-bound
cls_bpf and XDP programs.  Make this restriction explicit in
the BPF offload code.  This way we can potentially reuse the
ifindex field in the future.

Since XDP and cls_bpf programs can only be loaded by admin,
we can drop the explicit capability check from offload code.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/offload.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index fd696d3dd429..ac187f9ee182 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -14,8 +14,9 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 	struct net *net = current->nsproxy->net_ns;
 	struct bpf_dev_offload *offload;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+	    attr->prog_type != BPF_PROG_TYPE_XDP)
+		return -EINVAL;
 
 	if (attr->prog_flags)
 		return -EINVAL;
-- 
cgit v1.3-14-g43fede


From 1f6f4cb7ba219b00a3fa9afe8049fa16444d8b52 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:53 -0800
Subject: bpf: offload: rename the ifindex field

bpf_target_prog seems long and clunky, rename it to prog_ifindex.
We don't want to call this field just ifindex, because maps
may need a similar field in the future and bpf_attr members for
programs and maps are unnamed.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/uapi/linux/bpf.h       | 2 +-
 kernel/bpf/offload.c           | 2 +-
 kernel/bpf/syscall.c           | 4 ++--
 tools/include/uapi/linux/bpf.h | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e880ae6434ee..3f626df42516 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -262,7 +262,7 @@ union bpf_attr {
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
-		__u32		prog_target_ifindex;	/* ifindex of netdev to prep for */
+		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index ac187f9ee182..a778e5df7e26 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -29,7 +29,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
 	init_waitqueue_head(&offload->verifier_done);
 
 	rtnl_lock();
-	offload->netdev = __dev_get_by_index(net, attr->prog_target_ifindex);
+	offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
 	if (!offload->netdev) {
 		rtnl_unlock();
 		kfree(offload);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 09badc37e864..8e9d065bb7cd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1118,7 +1118,7 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_target_ifindex
+#define	BPF_PROG_LOAD_LAST_FIELD prog_ifindex
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1181,7 +1181,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 	atomic_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
-	if (attr->prog_target_ifindex) {
+	if (attr->prog_ifindex) {
 		err = bpf_prog_offload_init(prog, attr);
 		if (err)
 			goto free_prog;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e880ae6434ee..3f626df42516 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -262,7 +262,7 @@ union bpf_attr {
 		__u32		kern_version;	/* checked when prog_type=kprobe */
 		__u32		prog_flags;
 		char		prog_name[BPF_OBJ_NAME_LEN];
-		__u32		prog_target_ifindex;	/* ifindex of netdev to prep for */
+		__u32		prog_ifindex;	/* ifindex of netdev to prep for */
 	};
 
 	struct { /* anonymous struct used by BPF_OBJ_* commands */
-- 
cgit v1.3-14-g43fede


From 288b3de55aace830f13280985ec9e6bcbff33b1b Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:54 -0800
Subject: bpf: offload: move offload device validation out to the drivers

With TC shared block changes we can't depend on correct netdev
pointer being available in cls_bpf.  Move the device validation
to the driver.  Core will only make sure that offloaded programs
are always attached in the driver (or in HW by the driver).  We
trust that drivers which implement offload callbacks will perform
necessary checks.

Moving the checks to the driver is generally a useful thing,
in practice the check should be against a switchdev instance,
not a netdev, given that most ASICs will probably allow using
the same program on many ports.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 drivers/net/ethernet/netronome/nfp/bpf/offload.c | 10 ++++++++--
 include/linux/bpf.h                              |  4 ++--
 kernel/bpf/syscall.c                             | 23 ++++++++++++-----------
 net/core/dev.c                                   |  7 ++-----
 net/sched/cls_bpf.c                              |  8 +++-----
 5 files changed, 27 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index b6cee71f49d3..bc879aeb62d4 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -214,8 +214,14 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog,
 {
 	int err;
 
-	if (prog && !prog->aux->offload)
-		return -EINVAL;
+	if (prog) {
+		struct bpf_dev_offload *offload = prog->aux->offload;
+
+		if (!offload)
+			return -EINVAL;
+		if (offload->netdev != nn->dp.netdev)
+			return -EINVAL;
+	}
 
 	if (prog && old_prog) {
 		u8 cap;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c397934f91dd..f82be640731e 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -336,7 +336,7 @@ extern const struct bpf_verifier_ops xdp_analyzer_ops;
 struct bpf_prog *bpf_prog_get(u32 ufd);
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
-				       struct net_device *netdev);
+				       bool attach_drv);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
 void bpf_prog_sub(struct bpf_prog *prog, int i);
 struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog);
@@ -433,7 +433,7 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
 						     enum bpf_prog_type type,
-						     struct net_device *netdev)
+						     bool attach_drv)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8e9d065bb7cd..38da55905ab0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1057,22 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static bool bpf_prog_can_attach(struct bpf_prog *prog,
-				enum bpf_prog_type *attach_type,
-				struct net_device *netdev)
+static bool bpf_prog_get_ok(struct bpf_prog *prog,
+			    enum bpf_prog_type *attach_type, bool attach_drv)
 {
-	struct bpf_dev_offload *offload = prog->aux->offload;
+	/* not an attachment, just a refcount inc, always allow */
+	if (!attach_type)
+		return true;
 
 	if (prog->type != *attach_type)
 		return false;
-	if (offload && offload->netdev != netdev)
+	if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
 		return false;
 
 	return true;
 }
 
 static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
-				       struct net_device *netdev)
+				       bool attach_drv)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
@@ -1080,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
 	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
-	if (attach_type && !bpf_prog_can_attach(prog, attach_type, netdev)) {
+	if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
 		prog = ERR_PTR(-EINVAL);
 		goto out;
 	}
@@ -1093,12 +1094,12 @@ out:
 
 struct bpf_prog *bpf_prog_get(u32 ufd)
 {
-	return __bpf_prog_get(ufd, NULL, NULL);
+	return __bpf_prog_get(ufd, NULL, false);
 }
 
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 {
-	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, NULL);
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false);
 
 	if (!IS_ERR(prog))
 		trace_bpf_prog_get_type(prog);
@@ -1107,9 +1108,9 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
-				       struct net_device *netdev)
+				       bool attach_drv)
 {
-	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, netdev);
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
 
 	if (!IS_ERR(prog))
 		trace_bpf_prog_get_type(prog);
diff --git a/net/core/dev.c b/net/core/dev.c
index 8ee29f4f5fa9..09525a27319c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7139,11 +7139,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		    __dev_xdp_attached(dev, bpf_op, NULL))
 			return -EBUSY;
 
-		if (bpf_op == ops->ndo_bpf)
-			prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
-						     dev);
-		else
-			prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+		prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+					     bpf_op == ops->ndo_bpf);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index fb680dafac5a..a9f3e317055c 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -382,15 +382,13 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
 {
 	struct bpf_prog *fp;
 	char *name = NULL;
+	bool skip_sw;
 	u32 bpf_fd;
 
 	bpf_fd = nla_get_u32(tb[TCA_BPF_FD]);
+	skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW;
 
-	if (gen_flags & TCA_CLS_FLAGS_SKIP_SW)
-		fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS,
-					   qdisc_dev(tp->q));
-	else
-		fp = bpf_prog_get_type(bpf_fd, BPF_PROG_TYPE_SCHED_CLS);
+	fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw);
 	if (IS_ERR(fp))
 		return PTR_ERR(fp);
 
-- 
cgit v1.3-14-g43fede


From 479321e9c31a6c05426790b11888427400f75ac8 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:56 -0800
Subject: bpf: turn bpf_prog_get_type() into a wrapper

bpf_prog_get_type() is identical to bpf_prog_get_type_dev(),
with false passed as attach_drv.  Instead of keeping it as
an exported symbol turn it into static inline wrapper.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h  | 13 ++++++-------
 kernel/bpf/syscall.c | 10 ----------
 2 files changed, 6 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f82be640731e..37bbab8c0f56 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -334,7 +334,6 @@ extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops;
 extern const struct bpf_verifier_ops xdp_analyzer_ops;
 
 struct bpf_prog *bpf_prog_get(u32 ufd);
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type);
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 				       bool attach_drv);
 struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i);
@@ -425,12 +424,6 @@ static inline struct bpf_prog *bpf_prog_get(u32 ufd)
 	return ERR_PTR(-EOPNOTSUPP);
 }
 
-static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
-						 enum bpf_prog_type type)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
 static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd,
 						     enum bpf_prog_type type,
 						     bool attach_drv)
@@ -514,6 +507,12 @@ static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
 }
 #endif /* CONFIG_BPF_SYSCALL */
 
+static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
+						 enum bpf_prog_type type)
+{
+	return bpf_prog_get_type_dev(ufd, type, false);
+}
+
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
 u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 38da55905ab0..41509cf825d8 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1097,16 +1097,6 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 	return __bpf_prog_get(ufd, NULL, false);
 }
 
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
-{
-	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, false);
-
-	if (!IS_ERR(prog))
-		trace_bpf_prog_get_type(prog);
-	return prog;
-}
-EXPORT_SYMBOL_GPL(bpf_prog_get_type);
-
 struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
 				       bool attach_drv)
 {
-- 
cgit v1.3-14-g43fede


From 62c71b45e8537b8cb746cc929ea05ba0258e0b5a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:57 -0800
Subject: bpf: offload: ignore namespace moves

We are currently destroying the device offload state when device
moves to another net namespace.  This doesn't break with current
NFP code, because offload state is not used on program removal,
but it's not correct behaviour.

Ignore the device unregister notifications on namespace move.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/bpf/offload.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index a778e5df7e26..d4267c674fec 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -174,6 +174,10 @@ static int bpf_offload_notification(struct notifier_block *notifier,
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
+		/* ignore namespace changes */
+		if (netdev->reg_state != NETREG_UNREGISTERING)
+			break;
+
 		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
 					 offloads) {
 			if (offload->netdev == netdev)
-- 
cgit v1.3-14-g43fede


From 1ee640095f049e7ac4ec36b985abada497b98cc2 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Mon, 20 Nov 2017 15:21:59 -0800
Subject: bpf: revert report offload info to user space

This reverts commit bd601b6ada11 ("bpf: report offload info to user
space").  The ifindex by itself is not sufficient, we should provide
information on which network namespace this ifindex belongs to.
After considering some options we concluded that it's best to just
remove this API for now, and rework it in -next.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h      |  1 -
 include/uapi/linux/bpf.h |  6 ------
 kernel/bpf/offload.c     | 12 ------------
 kernel/bpf/syscall.c     |  5 -----
 4 files changed, 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 37bbab8c0f56..76c577281d78 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -515,7 +515,6 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd,
 
 int bpf_prog_offload_compile(struct bpf_prog *prog);
 void bpf_prog_offload_destroy(struct bpf_prog *prog);
-u32 bpf_prog_offload_ifindex(struct bpf_prog *prog);
 
 #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL)
 int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 3f626df42516..4c223ab30293 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -897,10 +897,6 @@ enum sk_action {
 
 #define BPF_TAG_SIZE	8
 
-enum bpf_prog_status {
-	BPF_PROG_STATUS_DEV_BOUND	= (1 << 0),
-};
-
 struct bpf_prog_info {
 	__u32 type;
 	__u32 id;
@@ -914,8 +910,6 @@ struct bpf_prog_info {
 	__u32 nr_map_ids;
 	__aligned_u64 map_ids;
 	char name[BPF_OBJ_NAME_LEN];
-	__u32 ifindex;
-	__u32 status;
 } __attribute__((aligned(8)));
 
 struct bpf_map_info {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index d4267c674fec..68ec884440b7 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -149,18 +149,6 @@ int bpf_prog_offload_compile(struct bpf_prog *prog)
 	return bpf_prog_offload_translate(prog);
 }
 
-u32 bpf_prog_offload_ifindex(struct bpf_prog *prog)
-{
-	struct bpf_dev_offload *offload = prog->aux->offload;
-	u32 ifindex;
-
-	rtnl_lock();
-	ifindex = offload->netdev ? offload->netdev->ifindex : 0;
-	rtnl_unlock();
-
-	return ifindex;
-}
-
 const struct bpf_prog_ops bpf_offload_prog_ops = {
 };
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 41509cf825d8..2c4cfeaa8d5e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1616,11 +1616,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 			return -EFAULT;
 	}
 
-	if (bpf_prog_is_dev_bound(prog->aux)) {
-		info.status |= BPF_PROG_STATUS_DEV_BOUND;
-		info.ifindex = bpf_prog_offload_ifindex(prog);
-	}
-
 done:
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
-- 
cgit v1.3-14-g43fede


From 24ed960abf1d50cb7834e99a0cfc081bc0656712 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 28 Aug 2017 11:28:21 -0700
Subject: treewide: Switch DEFINE_TIMER callbacks to struct timer_list *

This changes all DEFINE_TIMER() callbacks to use a struct timer_list
pointer instead of unsigned long. Since the data argument has already been
removed, none of these callbacks are using their argument currently, so
this renames the argument to "unused".

Done using the following semantic patch:

@match_define_timer@
declarer name DEFINE_TIMER;
identifier _timer, _callback;
@@

 DEFINE_TIMER(_timer, _callback);

@change_callback depends on match_define_timer@
identifier match_define_timer._callback;
type _origtype;
identifier _origarg;
@@

 void
-_callback(_origtype _origarg)
+_callback(struct timer_list *unused)
 { ... }

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm/mach-ixp4xx/dsmg600-setup.c      |  4 ++--
 arch/arm/mach-ixp4xx/nas100d-setup.c      |  4 ++--
 arch/m68k/amiga/amisound.c                |  4 ++--
 arch/m68k/mac/macboing.c                  |  4 ++--
 arch/mips/mti-malta/malta-display.c       |  4 ++--
 arch/parisc/kernel/pdc_cons.c             |  4 ++--
 drivers/atm/idt77105.c                    |  8 ++++----
 drivers/atm/iphase.c                      |  4 ++--
 drivers/block/ataflop.c                   | 16 ++++++++--------
 drivers/char/dtlk.c                       |  4 ++--
 drivers/char/hangcheck-timer.c            |  4 ++--
 drivers/char/nwbutton.c                   |  4 ++--
 drivers/char/nwbutton.h                   |  2 +-
 drivers/char/rtc.c                        |  4 ++--
 drivers/input/touchscreen/s3c2410_ts.c    |  2 +-
 drivers/net/wireless/atmel/at76c50x-usb.c |  4 ++--
 drivers/staging/speakup/main.c            |  4 ++--
 drivers/staging/speakup/synth.c           |  2 +-
 drivers/tty/cyclades.c                    |  4 ++--
 drivers/tty/isicom.c                      |  4 ++--
 drivers/tty/moxa.c                        |  4 ++--
 drivers/tty/rocket.c                      |  4 ++--
 drivers/tty/vt/keyboard.c                 |  2 +-
 drivers/tty/vt/vt.c                       |  4 ++--
 drivers/watchdog/alim7101_wdt.c           |  4 ++--
 drivers/watchdog/machzwd.c                |  4 ++--
 drivers/watchdog/mixcomwd.c               |  4 ++--
 drivers/watchdog/sbc60xxwdt.c             |  4 ++--
 drivers/watchdog/sc520_wdt.c              |  4 ++--
 drivers/watchdog/via_wdt.c                |  4 ++--
 drivers/watchdog/w83877f_wdt.c            |  4 ++--
 drivers/xen/grant-table.c                 |  4 ++--
 fs/pstore/platform.c                      |  4 ++--
 kernel/irq/spurious.c                     |  4 ++--
 lib/random32.c                            |  4 ++--
 net/decnet/dn_route.c                     |  4 ++--
 net/ipv6/ip6_flowlabel.c                  |  4 ++--
 net/netrom/nr_loopback.c                  |  4 ++--
 security/keys/gc.c                        |  4 ++--
 39 files changed, 82 insertions(+), 82 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-ixp4xx/dsmg600-setup.c b/arch/arm/mach-ixp4xx/dsmg600-setup.c
index ac97a4599034..0f5c99941a7d 100644
--- a/arch/arm/mach-ixp4xx/dsmg600-setup.c
+++ b/arch/arm/mach-ixp4xx/dsmg600-setup.c
@@ -179,10 +179,10 @@ static int power_button_countdown;
 /* Must hold the button down for at least this many counts to be processed */
 #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */
 
-static void dsmg600_power_handler(unsigned long data);
+static void dsmg600_power_handler(struct timer_list *unused);
 static DEFINE_TIMER(dsmg600_power_timer, dsmg600_power_handler);
 
-static void dsmg600_power_handler(unsigned long data)
+static void dsmg600_power_handler(struct timer_list *unused)
 {
 	/* This routine is called twice per second to check the
 	 * state of the power button.
diff --git a/arch/arm/mach-ixp4xx/nas100d-setup.c b/arch/arm/mach-ixp4xx/nas100d-setup.c
index 435602085408..76dfff03cb71 100644
--- a/arch/arm/mach-ixp4xx/nas100d-setup.c
+++ b/arch/arm/mach-ixp4xx/nas100d-setup.c
@@ -202,10 +202,10 @@ static int power_button_countdown;
 /* Must hold the button down for at least this many counts to be processed */
 #define PBUTTON_HOLDDOWN_COUNT 4 /* 2 secs */
 
-static void nas100d_power_handler(unsigned long data);
+static void nas100d_power_handler(struct timer_list *unused);
 static DEFINE_TIMER(nas100d_power_timer, nas100d_power_handler);
 
-static void nas100d_power_handler(unsigned long data)
+static void nas100d_power_handler(struct timer_list *unused)
 {
 	/* This routine is called twice per second to check the
 	 * state of the power button.
diff --git a/arch/m68k/amiga/amisound.c b/arch/m68k/amiga/amisound.c
index a23f48181fd6..442bdeee6bd7 100644
--- a/arch/m68k/amiga/amisound.c
+++ b/arch/m68k/amiga/amisound.c
@@ -65,7 +65,7 @@ void __init amiga_init_sound(void)
 #endif
 }
 
-static void nosound( unsigned long ignored );
+static void nosound(struct timer_list *unused);
 static DEFINE_TIMER(sound_timer, nosound);
 
 void amiga_mksound( unsigned int hz, unsigned int ticks )
@@ -107,7 +107,7 @@ void amiga_mksound( unsigned int hz, unsigned int ticks )
 }
 
 
-static void nosound( unsigned long ignored )
+static void nosound(struct timer_list *unused)
 {
 	/* turn off DMA for audio channel 2 */
 	custom.dmacon = DMAF_AUD2;
diff --git a/arch/m68k/mac/macboing.c b/arch/m68k/mac/macboing.c
index d17668649641..135a87bbd1a2 100644
--- a/arch/m68k/mac/macboing.c
+++ b/arch/m68k/mac/macboing.c
@@ -48,7 +48,7 @@ static unsigned long mac_bell_phasepersample;
  * some function protos
  */
 static void mac_init_asc( void );
-static void mac_nosound( unsigned long );
+static void mac_nosound(struct timer_list *);
 static void mac_quadra_start_bell( unsigned int, unsigned int, unsigned int );
 static void mac_quadra_ring_bell( unsigned long );
 static void mac_av_start_bell( unsigned int, unsigned int, unsigned int );
@@ -216,7 +216,7 @@ void mac_mksound( unsigned int freq, unsigned int length )
 /*
  * regular ASC: stop whining ..
  */
-static void mac_nosound( unsigned long ignored )
+static void mac_nosound(struct timer_list *unused)
 {
 	mac_asc_regs[ ASC_ENABLE ] = 0;
 }
diff --git a/arch/mips/mti-malta/malta-display.c b/arch/mips/mti-malta/malta-display.c
index 063de44675ce..ee0bd50f754b 100644
--- a/arch/mips/mti-malta/malta-display.c
+++ b/arch/mips/mti-malta/malta-display.c
@@ -36,10 +36,10 @@ void mips_display_message(const char *str)
 	}
 }
 
-static void scroll_display_message(unsigned long unused);
+static void scroll_display_message(struct timer_list *unused);
 static DEFINE_TIMER(mips_scroll_timer, scroll_display_message);
 
-static void scroll_display_message(unsigned long unused)
+static void scroll_display_message(struct timer_list *unused)
 {
 	mips_display_message(&display_string[display_count++]);
 	if (display_count == max_display_count)
diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c
index 27a2dd616a7d..c46bf29ae412 100644
--- a/arch/parisc/kernel/pdc_cons.c
+++ b/arch/parisc/kernel/pdc_cons.c
@@ -91,7 +91,7 @@ static int pdc_console_setup(struct console *co, char *options)
 
 #define PDC_CONS_POLL_DELAY (30 * HZ / 1000)
 
-static void pdc_console_poll(unsigned long unused);
+static void pdc_console_poll(struct timer_list *unused);
 static DEFINE_TIMER(pdc_console_timer, pdc_console_poll);
 static struct tty_port tty_port;
 
@@ -135,7 +135,7 @@ static const struct tty_operations pdc_console_tty_ops = {
 	.chars_in_buffer = pdc_console_tty_chars_in_buffer,
 };
 
-static void pdc_console_poll(unsigned long unused)
+static void pdc_console_poll(struct timer_list *unused)
 {
 	int data, count = 0;
 
diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c
index 909744eb7bab..0a67487c0b1d 100644
--- a/drivers/atm/idt77105.c
+++ b/drivers/atm/idt77105.c
@@ -45,8 +45,8 @@ static DEFINE_SPINLOCK(idt77105_priv_lock);
 #define PUT(val,reg) dev->ops->phy_put(dev,val,IDT77105_##reg)
 #define GET(reg) dev->ops->phy_get(dev,IDT77105_##reg)
 
-static void idt77105_stats_timer_func(unsigned long);
-static void idt77105_restart_timer_func(unsigned long);
+static void idt77105_stats_timer_func(struct timer_list *);
+static void idt77105_restart_timer_func(struct timer_list *);
 
 
 static DEFINE_TIMER(stats_timer, idt77105_stats_timer_func);
@@ -80,7 +80,7 @@ static u16 get_counter(struct atm_dev *dev, int counter)
  * a separate copy of the stats allows implementation of
  * an ioctl which gathers the stats *without* zero'ing them.
  */
-static void idt77105_stats_timer_func(unsigned long dummy)
+static void idt77105_stats_timer_func(struct timer_list *unused)
 {
 	struct idt77105_priv *walk;
 	struct atm_dev *dev;
@@ -109,7 +109,7 @@ static void idt77105_stats_timer_func(unsigned long dummy)
  * interrupts need to be disabled when the cable is pulled out
  * to avoid lots of spurious cell error interrupts.
  */
-static void idt77105_restart_timer_func(unsigned long dummy)
+static void idt77105_restart_timer_func(struct timer_list *unused)
 {
 	struct idt77105_priv *walk;
 	struct atm_dev *dev;
diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index 12f646760b68..98a3a43484c8 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -75,7 +75,7 @@ static void desc_dbg(IADEV *iadev);
 static IADEV *ia_dev[8];
 static struct atm_dev *_ia_dev[8];
 static int iadev_count;
-static void ia_led_timer(unsigned long arg);
+static void ia_led_timer(struct timer_list *unused);
 static DEFINE_TIMER(ia_timer, ia_led_timer);
 static int IA_TX_BUF = DFL_TX_BUFFERS, IA_TX_BUF_SZ = DFL_TX_BUF_SZ;
 static int IA_RX_BUF = DFL_RX_BUFFERS, IA_RX_BUF_SZ = DFL_RX_BUF_SZ;
@@ -2432,7 +2432,7 @@ static void ia_update_stats(IADEV *iadev) {
     return;
 }
   
-static void ia_led_timer(unsigned long arg) {
+static void ia_led_timer(struct timer_list *unused) {
  	unsigned long flags;
   	static u_char blinking[8] = {0, 0, 0, 0, 0, 0, 0, 0};
         u_char i;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index ae596e55bcb6..8bc3b9fd8dd2 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -342,8 +342,8 @@ static int NeedSeek = 0;
 static void fd_select_side( int side );
 static void fd_select_drive( int drive );
 static void fd_deselect( void );
-static void fd_motor_off_timer( unsigned long dummy );
-static void check_change( unsigned long dummy );
+static void fd_motor_off_timer(struct timer_list *unused);
+static void check_change(struct timer_list *unused);
 static irqreturn_t floppy_irq (int irq, void *dummy);
 static void fd_error( void );
 static int do_format(int drive, int type, struct atari_format_descr *desc);
@@ -353,12 +353,12 @@ static void fd_calibrate_done( int status );
 static void fd_seek( void );
 static void fd_seek_done( int status );
 static void fd_rwsec( void );
-static void fd_readtrack_check( unsigned long dummy );
+static void fd_readtrack_check(struct timer_list *unused);
 static void fd_rwsec_done( int status );
 static void fd_rwsec_done1(int status);
 static void fd_writetrack( void );
 static void fd_writetrack_done( int status );
-static void fd_times_out( unsigned long dummy );
+static void fd_times_out(struct timer_list *unused);
 static void finish_fdc( void );
 static void finish_fdc_done( int dummy );
 static void setup_req_params( int drive );
@@ -479,7 +479,7 @@ static void fd_deselect( void )
  * counts the index signals, which arrive only if one drive is selected.
  */
 
-static void fd_motor_off_timer( unsigned long dummy )
+static void fd_motor_off_timer(struct timer_list *unused)
 {
 	unsigned char status;
 
@@ -515,7 +515,7 @@ static void fd_motor_off_timer( unsigned long dummy )
  * as possible) and keep track of the current state of the write protection.
  */
 
-static void check_change( unsigned long dummy )
+static void check_change(struct timer_list *unused)
 {
 	static int    drive = 0;
 
@@ -966,7 +966,7 @@ static void fd_rwsec( void )
 }
 
     
-static void fd_readtrack_check( unsigned long dummy )
+static void fd_readtrack_check(struct timer_list *unused)
 {
 	unsigned long flags, addr, addr2;
 
@@ -1237,7 +1237,7 @@ static void fd_writetrack_done( int status )
 	fd_error();
 }
 
-static void fd_times_out( unsigned long dummy )
+static void fd_times_out(struct timer_list *unused)
 {
 	atari_disable_irq( IRQ_MFP_FDC );
 	if (!FloppyIRQHandler) goto end; /* int occurred after timer was fired, but
diff --git a/drivers/char/dtlk.c b/drivers/char/dtlk.c
index 1a0385ed6417..839ee61d352a 100644
--- a/drivers/char/dtlk.c
+++ b/drivers/char/dtlk.c
@@ -74,7 +74,7 @@
 #endif				/* TRACING */
 
 static DEFINE_MUTEX(dtlk_mutex);
-static void dtlk_timer_tick(unsigned long data);
+static void dtlk_timer_tick(struct timer_list *unused);
 
 static int dtlk_major;
 static int dtlk_port_lpc;
@@ -259,7 +259,7 @@ static unsigned int dtlk_poll(struct file *file, poll_table * wait)
 	return mask;
 }
 
-static void dtlk_timer_tick(unsigned long data)
+static void dtlk_timer_tick(struct timer_list *unused)
 {
 	TRACE_TEXT(" dtlk_timer_tick");
 	wake_up_interruptible(&dtlk_process_list);
diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c
index 5b8db2ed844d..7700280717f2 100644
--- a/drivers/char/hangcheck-timer.c
+++ b/drivers/char/hangcheck-timer.c
@@ -122,11 +122,11 @@ __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks);
 /* Last time scheduled */
 static unsigned long long hangcheck_tsc, hangcheck_tsc_margin;
 
-static void hangcheck_fire(unsigned long);
+static void hangcheck_fire(struct timer_list *);
 
 static DEFINE_TIMER(hangcheck_ticktock, hangcheck_fire);
 
-static void hangcheck_fire(unsigned long data)
+static void hangcheck_fire(struct timer_list *unused)
 {
 	unsigned long long cur_tsc, tsc_diff;
 
diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c
index 44006ed9558f..a7113b78251a 100644
--- a/drivers/char/nwbutton.c
+++ b/drivers/char/nwbutton.c
@@ -23,7 +23,7 @@
 #define __NWBUTTON_C		/* Tell the header file who we are */
 #include "nwbutton.h"
 
-static void button_sequence_finished (unsigned long parameters);
+static void button_sequence_finished(struct timer_list *unused);
 
 static int button_press_count;		/* The count of button presses */
 /* Times for the end of a sequence */
@@ -127,7 +127,7 @@ static void button_consume_callbacks (int bpcount)
  * any matching registered function callbacks, initiate reboot, etc.).
  */
 
-static void button_sequence_finished (unsigned long parameters)
+static void button_sequence_finished(struct timer_list *unused)
 {
 	if (IS_ENABLED(CONFIG_NWBUTTON_REBOOT) &&
 	    button_press_count == reboot_count)
diff --git a/drivers/char/nwbutton.h b/drivers/char/nwbutton.h
index abee3ca74801..9dedfd7adc0e 100644
--- a/drivers/char/nwbutton.h
+++ b/drivers/char/nwbutton.h
@@ -25,7 +25,7 @@ struct button_callback {
 
 /* Function prototypes: */
 
-static void button_sequence_finished (unsigned long parameters);
+static void button_sequence_finished(struct timer_list *unused);
 static irqreturn_t button_handler (int irq, void *dev_id);
 int button_init (void);
 int button_add_callback (void (*callback) (void), int count);
diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c
index 616871e68e09..5542a438bbd0 100644
--- a/drivers/char/rtc.c
+++ b/drivers/char/rtc.c
@@ -135,7 +135,7 @@ static struct fasync_struct *rtc_async_queue;
 static DECLARE_WAIT_QUEUE_HEAD(rtc_wait);
 
 #ifdef RTC_IRQ
-static void rtc_dropped_irq(unsigned long data);
+static void rtc_dropped_irq(struct timer_list *unused);
 
 static DEFINE_TIMER(rtc_irq_timer, rtc_dropped_irq);
 #endif
@@ -1171,7 +1171,7 @@ module_exit(rtc_exit);
  *	for something that requires a steady > 1KHz signal anyways.)
  */
 
-static void rtc_dropped_irq(unsigned long data)
+static void rtc_dropped_irq(struct timer_list *unused)
 {
 	unsigned long freq;
 
diff --git a/drivers/input/touchscreen/s3c2410_ts.c b/drivers/input/touchscreen/s3c2410_ts.c
index d3265b6b58b8..1173890f6719 100644
--- a/drivers/input/touchscreen/s3c2410_ts.c
+++ b/drivers/input/touchscreen/s3c2410_ts.c
@@ -102,7 +102,7 @@ static inline bool get_down(unsigned long data0, unsigned long data1)
 		!(data1 & S3C2410_ADCDAT0_UPDOWN));
 }
 
-static void touch_timer_fire(unsigned long data)
+static void touch_timer_fire(struct timer_list *unused)
 {
 	unsigned long data0;
 	unsigned long data1;
diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c
index ede89d4ffc88..e99e766a3028 100644
--- a/drivers/net/wireless/atmel/at76c50x-usb.c
+++ b/drivers/net/wireless/atmel/at76c50x-usb.c
@@ -518,11 +518,11 @@ exit:
 
 /* LED trigger */
 static int tx_activity;
-static void at76_ledtrig_tx_timerfunc(unsigned long data);
+static void at76_ledtrig_tx_timerfunc(struct timer_list *unused);
 static DEFINE_TIMER(ledtrig_tx_timer, at76_ledtrig_tx_timerfunc);
 DEFINE_LED_TRIGGER(ledtrig_tx);
 
-static void at76_ledtrig_tx_timerfunc(unsigned long data)
+static void at76_ledtrig_tx_timerfunc(struct timer_list *unused)
 {
 	static int tx_lastactivity;
 
diff --git a/drivers/staging/speakup/main.c b/drivers/staging/speakup/main.c
index 16497202473f..aae868509e13 100644
--- a/drivers/staging/speakup/main.c
+++ b/drivers/staging/speakup/main.c
@@ -1164,7 +1164,7 @@ static void spkup_write(const u16 *in_buf, int count)
 static const int NUM_CTL_LABELS = (MSG_CTL_END - MSG_CTL_START + 1);
 
 static void read_all_doc(struct vc_data *vc);
-static void cursor_done(u_long data);
+static void cursor_done(struct timer_list *unused);
 static DEFINE_TIMER(cursor_timer, cursor_done);
 
 static void do_handle_shift(struct vc_data *vc, u_char value, char up_flag)
@@ -1682,7 +1682,7 @@ static int speak_highlight(struct vc_data *vc)
 	return 0;
 }
 
-static void cursor_done(u_long data)
+static void cursor_done(struct timer_list *unused)
 {
 	struct vc_data *vc = vc_cons[cursor_con].d;
 	unsigned long flags;
diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c
index 6ddd3fc3f08d..aac29c816d09 100644
--- a/drivers/staging/speakup/synth.c
+++ b/drivers/staging/speakup/synth.c
@@ -153,7 +153,7 @@ int spk_synth_is_alive_restart(struct spk_synth *synth)
 }
 EXPORT_SYMBOL_GPL(spk_synth_is_alive_restart);
 
-static void thread_wake_up(u_long data)
+static void thread_wake_up(struct timer_list *unused)
 {
 	wake_up_interruptible_all(&speakup_event);
 }
diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c
index 5d442469c95e..cf0bde3bb927 100644
--- a/drivers/tty/cyclades.c
+++ b/drivers/tty/cyclades.c
@@ -279,7 +279,7 @@ static unsigned detect_isa_irq(void __iomem *);
 #endif				/* CONFIG_ISA */
 
 #ifndef CONFIG_CYZ_INTR
-static void cyz_poll(unsigned long);
+static void cyz_poll(struct timer_list *);
 
 /* The Cyclades-Z polling cycle is defined by this variable */
 static long cyz_polling_cycle = CZ_DEF_POLL;
@@ -1214,7 +1214,7 @@ static void cyz_rx_restart(struct timer_list *t)
 
 #else				/* CONFIG_CYZ_INTR */
 
-static void cyz_poll(unsigned long arg)
+static void cyz_poll(struct timer_list *unused)
 {
 	struct cyclades_card *cinfo;
 	struct cyclades_port *info;
diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c
index ee7958ab269f..015686ff4825 100644
--- a/drivers/tty/isicom.c
+++ b/drivers/tty/isicom.c
@@ -170,7 +170,7 @@ static struct pci_driver isicom_driver = {
 static int prev_card = 3;	/*	start servicing isi_card[0]	*/
 static struct tty_driver *isicom_normal;
 
-static void isicom_tx(unsigned long _data);
+static void isicom_tx(struct timer_list *unused);
 static void isicom_start(struct tty_struct *tty);
 
 static DEFINE_TIMER(tx, isicom_tx);
@@ -394,7 +394,7 @@ static inline int __isicom_paranoia_check(struct isi_port const *port,
  *	will do the rest of the work for us.
  */
 
-static void isicom_tx(unsigned long _data)
+static void isicom_tx(struct timer_list *unused)
 {
 	unsigned long flags, base;
 	unsigned int retries;
diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c
index 65a70f3c7cde..68cbc03aab4b 100644
--- a/drivers/tty/moxa.c
+++ b/drivers/tty/moxa.c
@@ -198,7 +198,7 @@ static void moxa_hangup(struct tty_struct *);
 static int moxa_tiocmget(struct tty_struct *tty);
 static int moxa_tiocmset(struct tty_struct *tty,
 			 unsigned int set, unsigned int clear);
-static void moxa_poll(unsigned long);
+static void moxa_poll(struct timer_list *);
 static void moxa_set_tty_param(struct tty_struct *, struct ktermios *);
 static void moxa_shutdown(struct tty_port *);
 static int moxa_carrier_raised(struct tty_port *);
@@ -1429,7 +1429,7 @@ put:
 	return 0;
 }
 
-static void moxa_poll(unsigned long ignored)
+static void moxa_poll(struct timer_list *unused)
 {
 	struct moxa_board_conf *brd;
 	u16 __iomem *ip;
diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c
index f7dc9b1ea806..bdd17d2aaafd 100644
--- a/drivers/tty/rocket.c
+++ b/drivers/tty/rocket.c
@@ -86,7 +86,7 @@
 
 /****** RocketPort Local Variables ******/
 
-static void rp_do_poll(unsigned long dummy);
+static void rp_do_poll(struct timer_list *unused);
 
 static struct tty_driver *rocket_driver;
 
@@ -525,7 +525,7 @@ static void rp_handle_port(struct r_port *info)
 /*
  *  The top level polling routine.  Repeats every 1/100 HZ (10ms).
  */
-static void rp_do_poll(unsigned long dummy)
+static void rp_do_poll(struct timer_list *unused)
 {
 	CONTROLLER_t *ctlp;
 	int ctrl, aiop, ch, line;
diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c
index c8d90d7e7e37..5d412df8e943 100644
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -244,7 +244,7 @@ static int kd_sound_helper(struct input_handle *handle, void *data)
 	return 0;
 }
 
-static void kd_nosound(unsigned long ignored)
+static void kd_nosound(struct timer_list *unused)
 {
 	static unsigned int zero;
 
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index bce4c71cb338..88b902c525d7 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -158,7 +158,7 @@ static void set_cursor(struct vc_data *vc);
 static void hide_cursor(struct vc_data *vc);
 static void console_callback(struct work_struct *ignored);
 static void con_driver_unregister_callback(struct work_struct *ignored);
-static void blank_screen_t(unsigned long dummy);
+static void blank_screen_t(struct timer_list *unused);
 static void set_palette(struct vc_data *vc);
 
 #define vt_get_kmsg_redirect() vt_kmsg_redirect(-1)
@@ -3929,7 +3929,7 @@ void unblank_screen(void)
  * (console operations can still happen at irq time, but only from printk which
  * has the console mutex. Not perfect yet, but better than no locking
  */
-static void blank_screen_t(unsigned long dummy)
+static void blank_screen_t(struct timer_list *unused)
 {
 	blank_timer_expired = 1;
 	schedule_work(&console_work);
diff --git a/drivers/watchdog/alim7101_wdt.c b/drivers/watchdog/alim7101_wdt.c
index 18e896eeca62..12f7ea62dddd 100644
--- a/drivers/watchdog/alim7101_wdt.c
+++ b/drivers/watchdog/alim7101_wdt.c
@@ -70,7 +70,7 @@ module_param(use_gpio, int, 0);
 MODULE_PARM_DESC(use_gpio,
 		"Use the gpio watchdog (required by old cobalt boards).");
 
-static void wdt_timer_ping(unsigned long);
+static void wdt_timer_ping(struct timer_list *);
 static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
@@ -87,7 +87,7 @@ MODULE_PARM_DESC(nowayout,
  *	Whack the dog
  */
 
-static void wdt_timer_ping(unsigned long unused)
+static void wdt_timer_ping(struct timer_list *unused)
 {
 	/* If we got a heartbeat pulse within the WDT_US_INTERVAL
 	 * we agree to ping the WDT
diff --git a/drivers/watchdog/machzwd.c b/drivers/watchdog/machzwd.c
index 8a616a57bb90..88d823d87a4b 100644
--- a/drivers/watchdog/machzwd.c
+++ b/drivers/watchdog/machzwd.c
@@ -121,7 +121,7 @@ module_param(action, int, 0);
 MODULE_PARM_DESC(action, "after watchdog resets, generate: "
 				"0 = RESET(*)  1 = SMI  2 = NMI  3 = SCI");
 
-static void zf_ping(unsigned long data);
+static void zf_ping(struct timer_list *unused);
 
 static int zf_action = GEN_RESET;
 static unsigned long zf_is_open;
@@ -237,7 +237,7 @@ static void zf_timer_on(void)
 }
 
 
-static void zf_ping(unsigned long data)
+static void zf_ping(struct timer_list *unused)
 {
 	unsigned int ctrl_reg = 0;
 	unsigned long flags;
diff --git a/drivers/watchdog/mixcomwd.c b/drivers/watchdog/mixcomwd.c
index c9e38096ea91..3cc07447c655 100644
--- a/drivers/watchdog/mixcomwd.c
+++ b/drivers/watchdog/mixcomwd.c
@@ -99,7 +99,7 @@ static struct {
 	{0x0000, 0},
 };
 
-static void mixcomwd_timerfun(unsigned long d);
+static void mixcomwd_timerfun(struct timer_list *unused);
 
 static unsigned long mixcomwd_opened; /* long req'd for setbit --RR */
 
@@ -120,7 +120,7 @@ static void mixcomwd_ping(void)
 	return;
 }
 
-static void mixcomwd_timerfun(unsigned long d)
+static void mixcomwd_timerfun(struct timer_list *unused)
 {
 	mixcomwd_ping();
 	mod_timer(&mixcomwd_timer, jiffies + 5 * HZ);
diff --git a/drivers/watchdog/sbc60xxwdt.c b/drivers/watchdog/sbc60xxwdt.c
index 8d589939bc84..87333a41f753 100644
--- a/drivers/watchdog/sbc60xxwdt.c
+++ b/drivers/watchdog/sbc60xxwdt.c
@@ -112,7 +112,7 @@ MODULE_PARM_DESC(nowayout,
 	"Watchdog cannot be stopped once started (default="
 				__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
-static void wdt_timer_ping(unsigned long);
+static void wdt_timer_ping(struct timer_list *);
 static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
@@ -122,7 +122,7 @@ static char wdt_expect_close;
  *	Whack the dog
  */
 
-static void wdt_timer_ping(unsigned long data)
+static void wdt_timer_ping(struct timer_list *unused)
 {
 	/* If we got a heartbeat pulse within the WDT_US_INTERVAL
 	 * we agree to ping the WDT
diff --git a/drivers/watchdog/sc520_wdt.c b/drivers/watchdog/sc520_wdt.c
index 3e9bbaa37bf4..6aadb56e7faa 100644
--- a/drivers/watchdog/sc520_wdt.c
+++ b/drivers/watchdog/sc520_wdt.c
@@ -123,7 +123,7 @@ MODULE_PARM_DESC(nowayout,
 
 static __u16 __iomem *wdtmrctl;
 
-static void wdt_timer_ping(unsigned long);
+static void wdt_timer_ping(struct timer_list *);
 static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
@@ -134,7 +134,7 @@ static DEFINE_SPINLOCK(wdt_spinlock);
  *	Whack the dog
  */
 
-static void wdt_timer_ping(unsigned long data)
+static void wdt_timer_ping(struct timer_list *unused)
 {
 	/* If we got a heartbeat pulse within the WDT_US_INTERVAL
 	 * we agree to ping the WDT
diff --git a/drivers/watchdog/via_wdt.c b/drivers/watchdog/via_wdt.c
index ad3c3be13b40..b085ef1084ec 100644
--- a/drivers/watchdog/via_wdt.c
+++ b/drivers/watchdog/via_wdt.c
@@ -67,7 +67,7 @@ static struct watchdog_device wdt_dev;
 static struct resource wdt_res;
 static void __iomem *wdt_mem;
 static unsigned int mmio;
-static void wdt_timer_tick(unsigned long data);
+static void wdt_timer_tick(struct timer_list *unused);
 static DEFINE_TIMER(timer, wdt_timer_tick);
 					/* The timer that pings the watchdog */
 static unsigned long next_heartbeat;	/* the next_heartbeat for the timer */
@@ -88,7 +88,7 @@ static inline void wdt_reset(void)
  *     then the external/userspace heartbeat).
  *  2) the watchdog timer has been stopped by userspace.
  */
-static void wdt_timer_tick(unsigned long data)
+static void wdt_timer_tick(struct timer_list *unused)
 {
 	if (time_before(jiffies, next_heartbeat) ||
 	   (!watchdog_active(&wdt_dev))) {
diff --git a/drivers/watchdog/w83877f_wdt.c b/drivers/watchdog/w83877f_wdt.c
index ba6b680af100..05658ecc0aa4 100644
--- a/drivers/watchdog/w83877f_wdt.c
+++ b/drivers/watchdog/w83877f_wdt.c
@@ -97,7 +97,7 @@ MODULE_PARM_DESC(nowayout,
 		"Watchdog cannot be stopped once started (default="
 				__MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
-static void wdt_timer_ping(unsigned long);
+static void wdt_timer_ping(struct timer_list *);
 static DEFINE_TIMER(timer, wdt_timer_ping);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
@@ -108,7 +108,7 @@ static DEFINE_SPINLOCK(wdt_spinlock);
  *	Whack the dog
  */
 
-static void wdt_timer_ping(unsigned long data)
+static void wdt_timer_ping(struct timer_list *unused)
 {
 	/* If we got a heartbeat pulse within the WDT_US_INTERVAL
 	 * we agree to ping the WDT
diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c
index 139e018a82b0..f45114fd8e1e 100644
--- a/drivers/xen/grant-table.c
+++ b/drivers/xen/grant-table.c
@@ -358,10 +358,10 @@ struct deferred_entry {
 	struct page *page;
 };
 static LIST_HEAD(deferred_list);
-static void gnttab_handle_deferred(unsigned long);
+static void gnttab_handle_deferred(struct timer_list *);
 static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
 
-static void gnttab_handle_deferred(unsigned long unused)
+static void gnttab_handle_deferred(struct timer_list *unused)
 {
 	unsigned int nr = 10;
 	struct deferred_entry *first = NULL;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 423159abd501..691032107f8c 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -61,7 +61,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content "
 
 static int pstore_new_entry;
 
-static void pstore_timefunc(unsigned long);
+static void pstore_timefunc(struct timer_list *);
 static DEFINE_TIMER(pstore_timer, pstore_timefunc);
 
 static void pstore_dowork(struct work_struct *);
@@ -890,7 +890,7 @@ static void pstore_dowork(struct work_struct *work)
 	pstore_get_records(1);
 }
 
-static void pstore_timefunc(unsigned long dummy)
+static void pstore_timefunc(struct timer_list *unused)
 {
 	if (pstore_new_entry) {
 		pstore_new_entry = 0;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 1215229d1c12..ef2a47e0eab6 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -20,7 +20,7 @@
 static int irqfixup __read_mostly;
 
 #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
-static void poll_spurious_irqs(unsigned long dummy);
+static void poll_spurious_irqs(struct timer_list *unused);
 static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
 static int irq_poll_cpu;
 static atomic_t irq_poll_active;
@@ -143,7 +143,7 @@ out:
 	return ok;
 }
 
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_spurious_irqs(struct timer_list *unused)
 {
 	struct irq_desc *desc;
 	int i;
diff --git a/lib/random32.c b/lib/random32.c
index 65cc018fef40..4aaa76404d56 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -213,11 +213,11 @@ static int __init prandom_init(void)
 }
 core_initcall(prandom_init);
 
-static void __prandom_timer(unsigned long dontcare);
+static void __prandom_timer(struct timer_list *unused);
 
 static DEFINE_TIMER(seed_timer, __prandom_timer);
 
-static void __prandom_timer(unsigned long dontcare)
+static void __prandom_timer(struct timer_list *unused)
 {
 	u32 entropy;
 	unsigned long expires;
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index b36dceab0dc1..de4a0cafb19f 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -125,7 +125,7 @@ static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
 					     struct sk_buff *skb,
 					     const void *daddr);
 static int dn_route_input(struct sk_buff *);
-static void dn_run_flush(unsigned long dummy);
+static void dn_run_flush(struct timer_list *unused);
 
 static struct dn_rt_hash_bucket *dn_rt_hash_table;
 static unsigned int dn_rt_hash_mask;
@@ -357,7 +357,7 @@ static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_rou
 	return 0;
 }
 
-static void dn_run_flush(unsigned long dummy)
+static void dn_run_flush(struct timer_list *unused)
 {
 	int i;
 	struct dn_route *rt, *next;
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 9f2e73c71768..7f59c8fabeeb 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -46,7 +46,7 @@
 static atomic_t fl_size = ATOMIC_INIT(0);
 static struct ip6_flowlabel __rcu *fl_ht[FL_HASH_MASK+1];
 
-static void ip6_fl_gc(unsigned long dummy);
+static void ip6_fl_gc(struct timer_list *unused);
 static DEFINE_TIMER(ip6_fl_gc_timer, ip6_fl_gc);
 
 /* FL hash table lock: it protects only of GC */
@@ -127,7 +127,7 @@ static void fl_release(struct ip6_flowlabel *fl)
 	spin_unlock_bh(&ip6_fl_lock);
 }
 
-static void ip6_fl_gc(unsigned long dummy)
+static void ip6_fl_gc(struct timer_list *unused)
 {
 	int i;
 	unsigned long now = jiffies;
diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c
index 989ae647825e..215ad22a9647 100644
--- a/net/netrom/nr_loopback.c
+++ b/net/netrom/nr_loopback.c
@@ -15,7 +15,7 @@
 #include <net/netrom.h>
 #include <linux/init.h>
 
-static void nr_loopback_timer(unsigned long);
+static void nr_loopback_timer(struct timer_list *);
 
 static struct sk_buff_head loopback_queue;
 static DEFINE_TIMER(loopback_timer, nr_loopback_timer);
@@ -48,7 +48,7 @@ int nr_loopback_queue(struct sk_buff *skb)
 	return 1;
 }
 
-static void nr_loopback_timer(unsigned long param)
+static void nr_loopback_timer(struct timer_list *unused)
 {
 	struct sk_buff *skb;
 	ax25_address *nr_dest;
diff --git a/security/keys/gc.c b/security/keys/gc.c
index afb3a9175d76..b93603724b8c 100644
--- a/security/keys/gc.c
+++ b/security/keys/gc.c
@@ -29,7 +29,7 @@ DECLARE_WORK(key_gc_work, key_garbage_collector);
 /*
  * Reaper for links from keyrings to dead keys.
  */
-static void key_gc_timer_func(unsigned long);
+static void key_gc_timer_func(struct timer_list *);
 static DEFINE_TIMER(key_gc_timer, key_gc_timer_func);
 
 static time_t key_gc_next_run = LONG_MAX;
@@ -84,7 +84,7 @@ void key_schedule_gc_links(void)
  * Some key's cleanup time was met after it expired, so we need to get the
  * reaper to go through a cycle finding expired keys.
  */
-static void key_gc_timer_func(unsigned long data)
+static void key_gc_timer_func(struct timer_list *unused)
 {
 	kenter("");
 	key_gc_next_run = LONG_MAX;
-- 
cgit v1.3-14-g43fede


From b9eaf18722221ef8b2bd6a67240ebe668622152a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 16 Oct 2017 13:15:39 -0700
Subject: treewide: init_timer() -> setup_timer()

This mechanically converts all remaining cases of ancient open-coded timer
setup with the old setup_timer() API, which is the first step in timer
conversions. This has no behavioral changes, since it ultimately just
changes the order of assignment to fields of struct timer_list when
finding variations of:

    init_timer(&t);
    f.function = timer_callback;
    t.data = timer_callback_arg;

to be converted into:

    setup_timer(&t, timer_callback, timer_callback_arg);

The conversion is done with the following Coccinelle script, which
is an improved version of scripts/cocci/api/setup_timer.cocci, in the
following ways:
 - assignments-before-init_timer() cases
 - limit the .data case removal to the specific struct timer_list instance
 - handling calls by dereference (timer->field vs timer.field)

spatch --very-quiet --all-includes --include-headers \
	-I ./arch/x86/include -I ./arch/x86/include/generated \
	-I ./include -I ./arch/x86/include/uapi \
	-I ./arch/x86/include/generated/uapi -I ./include/uapi \
	-I ./include/generated/uapi --include ./include/linux/kconfig.h \
	--dir . \
	--cocci-file ~/src/data/setup_timer.cocci

@fix_address_of@
expression e;
@@

 init_timer(
-&(e)
+&e
 , ...)

// Match the common cases first to avoid Coccinelle parsing loops with
// "... when" clauses.

@match_immediate_function_data_after_init_timer@
expression e, func, da;
@@

-init_timer
+setup_timer
 ( \(&e\|e\)
+, func, da
 );
(
-\(e.function\|e->function\) = func;
-\(e.data\|e->data\) = da;
|
-\(e.data\|e->data\) = da;
-\(e.function\|e->function\) = func;
)

@match_immediate_function_data_before_init_timer@
expression e, func, da;
@@

(
-\(e.function\|e->function\) = func;
-\(e.data\|e->data\) = da;
|
-\(e.data\|e->data\) = da;
-\(e.function\|e->function\) = func;
)
-init_timer
+setup_timer
 ( \(&e\|e\)
+, func, da
 );

@match_function_and_data_after_init_timer@
expression e, e2, e3, e4, e5, func, da;
@@

-init_timer
+setup_timer
 ( \(&e\|e\)
+, func, da
 );
 ... when != func = e2
     when != da = e3
(
-e.function = func;
... when != da = e4
-e.data = da;
|
-e->function = func;
... when != da = e4
-e->data = da;
|
-e.data = da;
... when != func = e5
-e.function = func;
|
-e->data = da;
... when != func = e5
-e->function = func;
)

@match_function_and_data_before_init_timer@
expression e, e2, e3, e4, e5, func, da;
@@
(
-e.function = func;
... when != da = e4
-e.data = da;
|
-e->function = func;
... when != da = e4
-e->data = da;
|
-e.data = da;
... when != func = e5
-e.function = func;
|
-e->data = da;
... when != func = e5
-e->function = func;
)
... when != func = e2
    when != da = e3
-init_timer
+setup_timer
 ( \(&e\|e\)
+, func, da
 );

@r1 exists@
expression t;
identifier f;
position p;
@@

f(...) { ... when any
  init_timer@p(\(&t\|t\))
  ... when any
}

@r2 exists@
expression r1.t;
identifier g != r1.f;
expression e8;
@@

g(...) { ... when any
  \(t.data\|t->data\) = e8
  ... when any
}

// It is dangerous to use setup_timer if data field is initialized
// in another function.
@script:python depends on r2@
p << r1.p;
@@

cocci.include_match(False)

@r3@
expression r1.t, func, e7;
position r1.p;
@@

(
-init_timer@p(&t);
+setup_timer(&t, func, 0UL);
... when != func = e7
-t.function = func;
|
-t.function = func;
... when != func = e7
-init_timer@p(&t);
+setup_timer(&t, func, 0UL);
|
-init_timer@p(t);
+setup_timer(t, func, 0UL);
... when != func = e7
-t->function = func;
|
-t->function = func;
... when != func = e7
-init_timer@p(t);
+setup_timer(t, func, 0UL);
)

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/arm/mach-iop32x/n2100.c                     |  3 +--
 arch/blackfin/kernel/nmi.c                       |  3 +--
 arch/sh/drivers/pci/common.c                     | 10 ++++------
 arch/sh/drivers/push-switch.c                    |  5 +----
 drivers/atm/firestream.c                         |  4 +---
 drivers/atm/lanai.c                              |  4 +---
 drivers/atm/nicstar.c                            |  4 +---
 drivers/block/DAC960.c                           |  5 ++---
 drivers/block/umem.c                             |  3 +--
 drivers/gpu/drm/omapdrm/dss/dsi.c                |  4 +---
 drivers/infiniband/hw/mthca/mthca_catas.c        |  4 +---
 drivers/isdn/i4l/isdn_common.c                   |  3 +--
 drivers/isdn/i4l/isdn_net.c                      |  6 +++---
 drivers/media/platform/s5p-mfc/s5p_mfc.c         |  5 ++---
 drivers/media/usb/au0828/au0828-dvb.c            |  5 ++---
 drivers/net/wireless/intersil/hostap/hostap_ap.c |  4 +---
 drivers/net/wireless/intersil/hostap/hostap_hw.c | 11 ++++-------
 drivers/nfc/pn533/pn533.c                        |  5 ++---
 drivers/nfc/st-nci/ndlc.c                        |  9 ++-------
 drivers/nfc/st-nci/se.c                          | 11 ++++-------
 drivers/nfc/st21nfca/se.c                        | 10 ++++------
 drivers/s390/block/dasd.c                        |  9 +++------
 drivers/s390/net/fsm.c                           |  4 +---
 drivers/scsi/arcmsr/arcmsr_hba.c                 | 10 ++++------
 drivers/scsi/arm/fas216.c                        |  4 +---
 drivers/scsi/bfa/bfad.c                          |  4 +---
 drivers/scsi/esas2r/esas2r_main.c                |  4 +---
 drivers/scsi/ncr53c8xx.c                         |  4 +---
 drivers/scsi/sym53c8xx_2/sym_glue.c              |  4 +---
 drivers/usb/gadget/udc/omap_udc.c                |  4 +---
 kernel/time/clocksource.c                        |  3 +--
 31 files changed, 55 insertions(+), 113 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c
index c1cd80ecc219..4a64a11ba63c 100644
--- a/arch/arm/mach-iop32x/n2100.c
+++ b/arch/arm/mach-iop32x/n2100.c
@@ -336,8 +336,7 @@ static int __init n2100_request_gpios(void)
 			pr_err("could not set power GPIO as input\n");
 	}
 	/* Set up power button poll timer */
-	init_timer(&power_button_poll_timer);
-	power_button_poll_timer.function = power_button_poll;
+	setup_timer(&power_button_poll_timer, power_button_poll, 0UL);
 	power_button_poll_timer.expires = jiffies + (HZ / 10);
 	add_timer(&power_button_poll_timer);
 	return 0;
diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c
index 1e714329fe8a..828f4fbdb58a 100644
--- a/arch/blackfin/kernel/nmi.c
+++ b/arch/blackfin/kernel/nmi.c
@@ -180,8 +180,7 @@ static int __init init_nmi_wdt(void)
 	nmi_wdt_start();
 	nmi_active = true;
 
-	init_timer(&ntimer);
-	ntimer.function = nmi_wdt_timer;
+	setup_timer(&ntimer, nmi_wdt_timer, 0UL);
 	ntimer.expires = jiffies + NMI_CHECK_TIMEOUT;
 	add_timer(&ntimer);
 
diff --git a/arch/sh/drivers/pci/common.c b/arch/sh/drivers/pci/common.c
index cae707f3472d..0d7eb7b5ac8d 100644
--- a/arch/sh/drivers/pci/common.c
+++ b/arch/sh/drivers/pci/common.c
@@ -106,15 +106,13 @@ static void pcibios_enable_serr(unsigned long __data)
 void pcibios_enable_timers(struct pci_channel *hose)
 {
 	if (hose->err_irq) {
-		init_timer(&hose->err_timer);
-		hose->err_timer.data = (unsigned long)hose;
-		hose->err_timer.function = pcibios_enable_err;
+		setup_timer(&hose->err_timer, pcibios_enable_err,
+			    (unsigned long)hose);
 	}
 
 	if (hose->serr_irq) {
-		init_timer(&hose->serr_timer);
-		hose->serr_timer.data = (unsigned long)hose;
-		hose->serr_timer.function = pcibios_enable_serr;
+		setup_timer(&hose->serr_timer, pcibios_enable_serr,
+			    (unsigned long)hose);
 	}
 }
 
diff --git a/arch/sh/drivers/push-switch.c b/arch/sh/drivers/push-switch.c
index 5bfb341cc5c4..2dc791507968 100644
--- a/arch/sh/drivers/push-switch.c
+++ b/arch/sh/drivers/push-switch.c
@@ -78,10 +78,7 @@ static int switch_drv_probe(struct platform_device *pdev)
 	}
 
 	INIT_WORK(&psw->work, switch_work_handler);
-	init_timer(&psw->debounce);
-
-	psw->debounce.function = switch_timer;
-	psw->debounce.data = (unsigned long)psw;
+	setup_timer(&psw->debounce, switch_timer, (unsigned long)psw);
 
 	/* Workqueue API brain-damage */
 	psw->pdev = pdev;
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index 6b6368a56526..534001270be5 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -1885,9 +1885,7 @@ static int fs_init(struct fs_dev *dev)
 	}
 
 #ifdef FS_POLL_FREQ
-	init_timer (&dev->timer);
-	dev->timer.data = (unsigned long) dev;
-	dev->timer.function = fs_poll;
+	setup_timer (&dev->timer, fs_poll, (unsigned long)dev);
 	dev->timer.expires = jiffies + FS_POLL_FREQ;
 	add_timer (&dev->timer);
 #endif
diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index 2351dad78ff5..87e8b5dfac39 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -1790,10 +1790,8 @@ static void lanai_timed_poll(unsigned long arg)
 
 static inline void lanai_timed_poll_start(struct lanai_dev *lanai)
 {
-	init_timer(&lanai->timer);
+	setup_timer(&lanai->timer, lanai_timed_poll, (unsigned long)lanai);
 	lanai->timer.expires = jiffies + LANAI_POLL_PERIOD;
-	lanai->timer.data = (unsigned long) lanai;
-	lanai->timer.function = lanai_timed_poll;
 	add_timer(&lanai->timer);
 }
 
diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
index a9702836cbae..335447ed0ba4 100644
--- a/drivers/atm/nicstar.c
+++ b/drivers/atm/nicstar.c
@@ -284,10 +284,8 @@ static int __init nicstar_init(void)
 	XPRINTK("nicstar: nicstar_init() returned.\n");
 
 	if (!error) {
-		init_timer(&ns_timer);
+		setup_timer(&ns_timer, ns_poll, 0UL);
 		ns_timer.expires = jiffies + NS_POLL_PERIOD;
-		ns_timer.data = 0UL;
-		ns_timer.function = ns_poll;
 		add_timer(&ns_timer);
 	}
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 255591ab3716..6f14cdd6015b 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3079,11 +3079,10 @@ DAC960_InitializeController(DAC960_Controller_T *Controller)
       /*
 	Initialize the Monitoring Timer.
       */
-      init_timer(&Controller->MonitoringTimer);
+      setup_timer(&Controller->MonitoringTimer,
+                  DAC960_MonitoringTimerFunction, (unsigned long)Controller);
       Controller->MonitoringTimer.expires =
 	jiffies + DAC960_MonitoringTimerInterval;
-      Controller->MonitoringTimer.data = (unsigned long) Controller;
-      Controller->MonitoringTimer.function = DAC960_MonitoringTimerFunction;
       add_timer(&Controller->MonitoringTimer);
       Controller->ControllerInitialized = true;
       return true;
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 0677d2514665..b4d4ccfe7582 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -738,8 +738,7 @@ static void check_all_batteries(unsigned long ptr)
 
 static void init_battery_timer(void)
 {
-	init_timer(&battery_timer);
-	battery_timer.function = check_all_batteries;
+	setup_timer(&battery_timer, check_all_batteries, 0UL);
 	battery_timer.expires = jiffies + (HZ * 60);
 	add_timer(&battery_timer);
 }
diff --git a/drivers/gpu/drm/omapdrm/dss/dsi.c b/drivers/gpu/drm/omapdrm/dss/dsi.c
index b56a05730314..cea744e4d9bd 100644
--- a/drivers/gpu/drm/omapdrm/dss/dsi.c
+++ b/drivers/gpu/drm/omapdrm/dss/dsi.c
@@ -5449,9 +5449,7 @@ static int dsi_bind(struct device *dev, struct device *master, void *data)
 			     dsi_framedone_timeout_work_callback);
 
 #ifdef DSI_CATCH_MISSING_TE
-	init_timer(&dsi->te_timer);
-	dsi->te_timer.function = dsi_te_timeout;
-	dsi->te_timer.data = 0;
+	setup_timer(&dsi->te_timer, dsi_te_timeout, 0);
 #endif
 
 	dsi_mem = platform_get_resource_byname(dsidev, IORESOURCE_MEM, "proto");
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c
index f6474c24f193..23cc08d5c24e 100644
--- a/drivers/infiniband/hw/mthca/mthca_catas.c
+++ b/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -149,7 +149,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
 {
 	phys_addr_t addr;
 
-	init_timer(&dev->catas_err.timer);
+	setup_timer(&dev->catas_err.timer, poll_catas, (unsigned long)dev);
 	dev->catas_err.map  = NULL;
 
 	addr = pci_resource_start(dev->pdev, 0) +
@@ -164,8 +164,6 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
 		return;
 	}
 
-	dev->catas_err.timer.data     = (unsigned long) dev;
-	dev->catas_err.timer.function = poll_catas;
 	dev->catas_err.timer.expires  = jiffies + MTHCA_CATAS_POLL_INTERVAL;
 	INIT_LIST_HEAD(&dev->catas_err.list);
 	add_timer(&dev->catas_err.timer);
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 38a5bb764c7b..3fa2f7b31131 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -2294,8 +2294,7 @@ static int __init isdn_init(void)
 		printk(KERN_WARNING "isdn: Could not allocate device-struct.\n");
 		return -EIO;
 	}
-	init_timer(&dev->timer);
-	dev->timer.function = isdn_timer_funct;
+	setup_timer(&dev->timer, isdn_timer_funct, 0UL);
 	spin_lock_init(&dev->lock);
 	spin_lock_init(&dev->timerlock);
 #ifdef MODULE
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index f63a110b7bcb..59d40160cab2 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -1615,9 +1615,9 @@ isdn_net_ciscohdlck_connected(isdn_net_local *lp)
 	/* send slarp request because interface/seq.no.s reset */
 	isdn_net_ciscohdlck_slarp_send_request(lp);
 
-	init_timer(&lp->cisco_timer);
-	lp->cisco_timer.data = (unsigned long) lp;
-	lp->cisco_timer.function = isdn_net_ciscohdlck_slarp_send_keepalive;
+	setup_timer(&lp->cisco_timer,
+		    isdn_net_ciscohdlck_slarp_send_keepalive,
+		    (unsigned long)lp);
 	lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ;
 	add_timer(&lp->cisco_timer);
 }
diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c
index 1839a86cc2a5..e179b33d3775 100644
--- a/drivers/media/platform/s5p-mfc/s5p_mfc.c
+++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c
@@ -1314,9 +1314,8 @@ static int s5p_mfc_probe(struct platform_device *pdev)
 	dev->hw_lock = 0;
 	INIT_WORK(&dev->watchdog_work, s5p_mfc_watchdog_worker);
 	atomic_set(&dev->watchdog_cnt, 0);
-	init_timer(&dev->watchdog_timer);
-	dev->watchdog_timer.data = (unsigned long)dev;
-	dev->watchdog_timer.function = s5p_mfc_watchdog;
+	setup_timer(&dev->watchdog_timer, s5p_mfc_watchdog,
+		    (unsigned long)dev);
 
 	ret = v4l2_device_register(&pdev->dev, &dev->v4l2_dev);
 	if (ret)
diff --git a/drivers/media/usb/au0828/au0828-dvb.c b/drivers/media/usb/au0828/au0828-dvb.c
index 34dc7e062471..d701c04b3783 100644
--- a/drivers/media/usb/au0828/au0828-dvb.c
+++ b/drivers/media/usb/au0828/au0828-dvb.c
@@ -648,9 +648,8 @@ int au0828_dvb_register(struct au0828_dev *dev)
 		return ret;
 	}
 
-	dev->bulk_timeout.function = au0828_bulk_timeout;
-	dev->bulk_timeout.data = (unsigned long) dev;
-	init_timer(&dev->bulk_timeout);
+	setup_timer(&dev->bulk_timeout, au0828_bulk_timeout,
+		    (unsigned long)dev);
 
 	return 0;
 }
diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c
index 1a8d8db80b05..f9d047314692 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ap.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c
@@ -1189,10 +1189,8 @@ static struct sta_info * ap_add_sta(struct ap_data *ap, u8 *addr)
 	}
 
 #ifndef PRISM2_NO_KERNEL_IEEE80211_MGMT
-	init_timer(&sta->timer);
+	setup_timer(&sta->timer, ap_handle_timer, (unsigned long)sta);
 	sta->timer.expires = jiffies + ap->max_inactivity;
-	sta->timer.data = (unsigned long) sta;
-	sta->timer.function = ap_handle_timer;
 	if (!ap->local->hostapd)
 		add_timer(&sta->timer);
 #endif /* PRISM2_NO_KERNEL_IEEE80211_MGMT */
diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c
index 72b46eaf3de2..8177fd6f65c1 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_hw.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c
@@ -3225,13 +3225,10 @@ while (0)
 
 	lib80211_crypt_info_init(&local->crypt_info, dev->name, &local->lock);
 
-	init_timer(&local->passive_scan_timer);
-	local->passive_scan_timer.data = (unsigned long) local;
-	local->passive_scan_timer.function = hostap_passive_scan;
-
-	init_timer(&local->tick_timer);
-	local->tick_timer.data = (unsigned long) local;
-	local->tick_timer.function = hostap_tick_timer;
+	setup_timer(&local->passive_scan_timer, hostap_passive_scan,
+		    (unsigned long)local);
+	setup_timer(&local->tick_timer, hostap_tick_timer,
+		    (unsigned long)local);
 	local->tick_timer.expires = jiffies + 2 * HZ;
 	add_timer(&local->tick_timer);
 
diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c
index c05cb637ba92..2effa5ff7082 100644
--- a/drivers/nfc/pn533/pn533.c
+++ b/drivers/nfc/pn533/pn533.c
@@ -2632,9 +2632,8 @@ struct pn533 *pn533_register_device(u32 device_type,
 	if (priv->wq == NULL)
 		goto error;
 
-	init_timer(&priv->listen_timer);
-	priv->listen_timer.data = (unsigned long) priv;
-	priv->listen_timer.function = pn533_listen_mode_timer;
+	setup_timer(&priv->listen_timer, pn533_listen_mode_timer,
+		    (unsigned long)priv);
 
 	skb_queue_head_init(&priv->resp_q);
 	skb_queue_head_init(&priv->fragment_skb);
diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c
index 9477994cf975..93a7536a9af9 100644
--- a/drivers/nfc/st-nci/ndlc.c
+++ b/drivers/nfc/st-nci/ndlc.c
@@ -282,13 +282,8 @@ int ndlc_probe(void *phy_id, struct nfc_phy_ops *phy_ops, struct device *dev,
 	*ndlc_id = ndlc;
 
 	/* initialize timers */
-	init_timer(&ndlc->t1_timer);
-	ndlc->t1_timer.data = (unsigned long)ndlc;
-	ndlc->t1_timer.function = ndlc_t1_timeout;
-
-	init_timer(&ndlc->t2_timer);
-	ndlc->t2_timer.data = (unsigned long)ndlc;
-	ndlc->t2_timer.function = ndlc_t2_timeout;
+	setup_timer(&ndlc->t1_timer, ndlc_t1_timeout, (unsigned long)ndlc);
+	setup_timer(&ndlc->t2_timer, ndlc_t2_timeout, (unsigned long)ndlc);
 
 	skb_queue_head_init(&ndlc->rcv_q);
 	skb_queue_head_init(&ndlc->send_q);
diff --git a/drivers/nfc/st-nci/se.c b/drivers/nfc/st-nci/se.c
index 56f2112e0cd8..bd7c1e83169c 100644
--- a/drivers/nfc/st-nci/se.c
+++ b/drivers/nfc/st-nci/se.c
@@ -725,15 +725,12 @@ int st_nci_se_init(struct nci_dev *ndev, struct st_nci_se_status *se_status)
 
 	init_completion(&info->se_info.req_completion);
 	/* initialize timers */
-	init_timer(&info->se_info.bwi_timer);
-	info->se_info.bwi_timer.data = (unsigned long)info;
-	info->se_info.bwi_timer.function = st_nci_se_wt_timeout;
+	setup_timer(&info->se_info.bwi_timer, st_nci_se_wt_timeout,
+		    (unsigned long)info);
 	info->se_info.bwi_active = false;
 
-	init_timer(&info->se_info.se_active_timer);
-	info->se_info.se_active_timer.data = (unsigned long)info;
-	info->se_info.se_active_timer.function =
-			st_nci_se_activation_timeout;
+	setup_timer(&info->se_info.se_active_timer,
+		    st_nci_se_activation_timeout, (unsigned long)info);
 	info->se_info.se_active = false;
 
 	info->se_info.xch_error = false;
diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c
index 3a98563d4a12..b2285455f77d 100644
--- a/drivers/nfc/st21nfca/se.c
+++ b/drivers/nfc/st21nfca/se.c
@@ -392,14 +392,12 @@ void st21nfca_se_init(struct nfc_hci_dev *hdev)
 
 	init_completion(&info->se_info.req_completion);
 	/* initialize timers */
-	init_timer(&info->se_info.bwi_timer);
-	info->se_info.bwi_timer.data = (unsigned long)info;
-	info->se_info.bwi_timer.function = st21nfca_se_wt_timeout;
+	setup_timer(&info->se_info.bwi_timer, st21nfca_se_wt_timeout,
+		    (unsigned long)info);
 	info->se_info.bwi_active = false;
 
-	init_timer(&info->se_info.se_active_timer);
-	info->se_info.se_active_timer.data = (unsigned long)info;
-	info->se_info.se_active_timer.function = st21nfca_se_activation_timeout;
+	setup_timer(&info->se_info.se_active_timer,
+		    st21nfca_se_activation_timeout, (unsigned long)info);
 	info->se_info.se_active = false;
 
 	info->se_info.count_pipes = 0;
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 29f35e29d480..adba91318768 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -119,9 +119,8 @@ struct dasd_device *dasd_alloc_device(void)
 		     (void (*)(unsigned long)) dasd_device_tasklet,
 		     (unsigned long) device);
 	INIT_LIST_HEAD(&device->ccw_queue);
-	init_timer(&device->timer);
-	device->timer.function = dasd_device_timeout;
-	device->timer.data = (unsigned long) device;
+	setup_timer(&device->timer, dasd_device_timeout,
+		    (unsigned long)device);
 	INIT_WORK(&device->kick_work, do_kick_device);
 	INIT_WORK(&device->restore_device, do_restore_device);
 	INIT_WORK(&device->reload_device, do_reload_device);
@@ -163,9 +162,7 @@ struct dasd_block *dasd_alloc_block(void)
 		     (unsigned long) block);
 	INIT_LIST_HEAD(&block->ccw_queue);
 	spin_lock_init(&block->queue_lock);
-	init_timer(&block->timer);
-	block->timer.function = dasd_block_timeout;
-	block->timer.data = (unsigned long) block;
+	setup_timer(&block->timer, dasd_block_timeout, (unsigned long)block);
 	spin_lock_init(&block->profile.lock);
 
 	return block;
diff --git a/drivers/s390/net/fsm.c b/drivers/s390/net/fsm.c
index 8c14c6c3ad3d..16b81be1f07a 100644
--- a/drivers/s390/net/fsm.c
+++ b/drivers/s390/net/fsm.c
@@ -142,13 +142,11 @@ void
 fsm_settimer(fsm_instance *fi, fsm_timer *this)
 {
 	this->fi = fi;
-	this->tl.function = (void *)fsm_expire_timer;
-	this->tl.data = (long)this;
 #if FSM_TIMER_DEBUG
 	printk(KERN_DEBUG "fsm(%s): Create timer %p\n", fi->name,
 	       this);
 #endif
-	init_timer(&this->tl);
+	setup_timer(&this->tl, (void *)fsm_expire_timer, (long)this);
 }
 
 void
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index af032c46ec0e..a54b6c11b505 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -837,10 +837,9 @@ static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	atomic_set(&acb->rq_map_token, 16);
 	atomic_set(&acb->ante_token_value, 16);
 	acb->fw_flag = FW_NORMAL;
-	init_timer(&acb->eternal_timer);
+	setup_timer(&acb->eternal_timer, &arcmsr_request_device_map,
+		    (unsigned long)acb);
 	acb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ);
-	acb->eternal_timer.data = (unsigned long) acb;
-	acb->eternal_timer.function = &arcmsr_request_device_map;
 	add_timer(&acb->eternal_timer);
 	if(arcmsr_alloc_sysfs_attr(acb))
 		goto out_free_sysfs;
@@ -930,10 +929,9 @@ static int arcmsr_resume(struct pci_dev *pdev)
 	atomic_set(&acb->rq_map_token, 16);
 	atomic_set(&acb->ante_token_value, 16);
 	acb->fw_flag = FW_NORMAL;
-	init_timer(&acb->eternal_timer);
+	setup_timer(&acb->eternal_timer, &arcmsr_request_device_map,
+		    (unsigned long)acb);
 	acb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ);
-	acb->eternal_timer.data = (unsigned long) acb;
-	acb->eternal_timer.function = &arcmsr_request_device_map;
 	add_timer(&acb->eternal_timer);
 	return 0;
 controller_stop:
diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c
index 24388795ee9a..7304d5a4fc4f 100644
--- a/drivers/scsi/arm/fas216.c
+++ b/drivers/scsi/arm/fas216.c
@@ -2849,9 +2849,7 @@ int fas216_init(struct Scsi_Host *host)
 	info->rst_dev_status = -1;
 	info->rst_bus_status = -1;
 	init_waitqueue_head(&info->eh_wait);
-	init_timer(&info->eh_timer);
-	info->eh_timer.data  = (unsigned long)info;
-	info->eh_timer.function = fas216_eh_timer;
+	setup_timer(&info->eh_timer, fas216_eh_timer, (unsigned long)info);
 	
 	spin_lock_init(&info->host_lock);
 
diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c
index 5caf5f3ff642..d10826a69725 100644
--- a/drivers/scsi/bfa/bfad.c
+++ b/drivers/scsi/bfa/bfad.c
@@ -719,9 +719,7 @@ bfad_bfa_tmo(unsigned long data)
 void
 bfad_init_timer(struct bfad_s *bfad)
 {
-	init_timer(&bfad->hal_tmo);
-	bfad->hal_tmo.function = bfad_bfa_tmo;
-	bfad->hal_tmo.data = (unsigned long)bfad;
+	setup_timer(&bfad->hal_tmo, bfad_bfa_tmo, (unsigned long)bfad);
 
 	mod_timer(&bfad->hal_tmo,
 		  jiffies + msecs_to_jiffies(BFA_TIMER_FREQ));
diff --git a/drivers/scsi/esas2r/esas2r_main.c b/drivers/scsi/esas2r/esas2r_main.c
index 81f226be3e3b..af4af504a97f 100644
--- a/drivers/scsi/esas2r/esas2r_main.c
+++ b/drivers/scsi/esas2r/esas2r_main.c
@@ -1635,10 +1635,8 @@ static void esas2r_timer_callback(unsigned long context);
 
 void esas2r_kickoff_timer(struct esas2r_adapter *a)
 {
-	init_timer(&a->timer);
+	setup_timer(&a->timer, esas2r_timer_callback, (unsigned long)a);
 
-	a->timer.function = esas2r_timer_callback;
-	a->timer.data = (unsigned long)a;
 	a->timer.expires = jiffies +
 			   msecs_to_jiffies(100);
 
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index 5b93ed810f6e..017216f5e919 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -8357,9 +8357,7 @@ struct Scsi_Host * __init ncr_attach(struct scsi_host_template *tpnt,
 	if (!np->scripth0)
 		goto attach_error;
 
-	init_timer(&np->timer);
-	np->timer.data     = (unsigned long) np;
-	np->timer.function = ncr53c8xx_timeout;
+	setup_timer(&np->timer, ncr53c8xx_timeout, (unsigned long)np);
 
 	/* Try to map the controller chip to virtual and physical memory. */
 
diff --git a/drivers/scsi/sym53c8xx_2/sym_glue.c b/drivers/scsi/sym53c8xx_2/sym_glue.c
index d32e3ba8863e..285397d42558 100644
--- a/drivers/scsi/sym53c8xx_2/sym_glue.c
+++ b/drivers/scsi/sym53c8xx_2/sym_glue.c
@@ -1351,9 +1351,7 @@ static struct Scsi_Host *sym_attach(struct scsi_host_template *tpnt, int unit,
 	/*
 	 *  Start the timer daemon
 	 */
-	init_timer(&np->s.timer);
-	np->s.timer.data     = (unsigned long) np;
-	np->s.timer.function = sym53c8xx_timer;
+	setup_timer(&np->s.timer, sym53c8xx_timer, (unsigned long)np);
 	np->s.lasttime=0;
 	sym_timer (np);
 
diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c
index fc7f810baef7..fb8c4bff584c 100644
--- a/drivers/usb/gadget/udc/omap_udc.c
+++ b/drivers/usb/gadget/udc/omap_udc.c
@@ -2542,9 +2542,7 @@ omap_ep_setup(char *name, u8 addr, u8 type,
 		}
 		if (dbuf && addr)
 			epn_rxtx |= UDC_EPN_RX_DB;
-		init_timer(&ep->timer);
-		ep->timer.function = pio_out_timer;
-		ep->timer.data = (unsigned long) ep;
+		setup_timer(&ep->timer, pio_out_timer, (unsigned long)ep);
 	}
 	if (addr)
 		epn_rxtx |= UDC_EPN_RX_VALID;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 03918a19cf2d..5b51d5ba2a85 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -290,8 +290,7 @@ static inline void clocksource_start_watchdog(void)
 {
 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
 		return;
-	init_timer(&watchdog_timer);
-	watchdog_timer.function = clocksource_watchdog;
+	setup_timer(&watchdog_timer, clocksource_watchdog, 0UL);
 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
 	watchdog_running = 1;
-- 
cgit v1.3-14-g43fede


From e99e88a9d2b067465adaa9c111ada99a041bef9a Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 16 Oct 2017 14:43:17 -0700
Subject: treewide: setup_timer() -> timer_setup()

This converts all remaining cases of the old setup_timer() API into using
timer_setup(), where the callback argument is the structure already
holding the struct timer_list. These should have no behavioral changes,
since they just change which pointer is passed into the callback with
the same available pointers after conversion. It handles the following
examples, in addition to some other variations.

Casting from unsigned long:

    void my_callback(unsigned long data)
    {
        struct something *ptr = (struct something *)data;
    ...
    }
    ...
    setup_timer(&ptr->my_timer, my_callback, ptr);

and forced object casts:

    void my_callback(struct something *ptr)
    {
    ...
    }
    ...
    setup_timer(&ptr->my_timer, my_callback, (unsigned long)ptr);

become:

    void my_callback(struct timer_list *t)
    {
        struct something *ptr = from_timer(ptr, t, my_timer);
    ...
    }
    ...
    timer_setup(&ptr->my_timer, my_callback, 0);

Direct function assignments:

    void my_callback(unsigned long data)
    {
        struct something *ptr = (struct something *)data;
    ...
    }
    ...
    ptr->my_timer.function = my_callback;

have a temporary cast added, along with converting the args:

    void my_callback(struct timer_list *t)
    {
        struct something *ptr = from_timer(ptr, t, my_timer);
    ...
    }
    ...
    ptr->my_timer.function = (TIMER_FUNC_TYPE)my_callback;

And finally, callbacks without a data assignment:

    void my_callback(unsigned long data)
    {
    ...
    }
    ...
    setup_timer(&ptr->my_timer, my_callback, 0);

have their argument renamed to verify they're unused during conversion:

    void my_callback(struct timer_list *unused)
    {
    ...
    }
    ...
    timer_setup(&ptr->my_timer, my_callback, 0);

The conversion is done with the following Coccinelle script:

spatch --very-quiet --all-includes --include-headers \
	-I ./arch/x86/include -I ./arch/x86/include/generated \
	-I ./include -I ./arch/x86/include/uapi \
	-I ./arch/x86/include/generated/uapi -I ./include/uapi \
	-I ./include/generated/uapi --include ./include/linux/kconfig.h \
	--dir . \
	--cocci-file ~/src/data/timer_setup.cocci

@fix_address_of@
expression e;
@@

 setup_timer(
-&(e)
+&e
 , ...)

// Update any raw setup_timer() usages that have a NULL callback, but
// would otherwise match change_timer_function_usage, since the latter
// will update all function assignments done in the face of a NULL
// function initialization in setup_timer().
@change_timer_function_usage_NULL@
expression _E;
identifier _timer;
type _cast_data;
@@

(
-setup_timer(&_E->_timer, NULL, _E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E->_timer, NULL, (_cast_data)_E);
+timer_setup(&_E->_timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, &_E);
+timer_setup(&_E._timer, NULL, 0);
|
-setup_timer(&_E._timer, NULL, (_cast_data)&_E);
+timer_setup(&_E._timer, NULL, 0);
)

@change_timer_function_usage@
expression _E;
identifier _timer;
struct timer_list _stl;
identifier _callback;
type _cast_func, _cast_data;
@@

(
-setup_timer(&_E->_timer, _callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, &_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, _E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, &_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)_E);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, (_cast_func)&_callback, (_cast_data)&_E);
+timer_setup(&_E._timer, _callback, 0);
|
 _E->_timer@_stl.function = _callback;
|
 _E->_timer@_stl.function = &_callback;
|
 _E->_timer@_stl.function = (_cast_func)_callback;
|
 _E->_timer@_stl.function = (_cast_func)&_callback;
|
 _E._timer@_stl.function = _callback;
|
 _E._timer@_stl.function = &_callback;
|
 _E._timer@_stl.function = (_cast_func)_callback;
|
 _E._timer@_stl.function = (_cast_func)&_callback;
)

// callback(unsigned long arg)
@change_callback_handle_cast
 depends on change_timer_function_usage@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
identifier _handle;
@@

 void _callback(
-_origtype _origarg
+struct timer_list *t
 )
 {
(
	... when != _origarg
	_handletype *_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
	... when != _origarg
|
	... when != _origarg
	_handletype *_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
	... when != _origarg
|
	... when != _origarg
	_handletype *_handle;
	... when != _handle
	_handle =
-(_handletype *)_origarg;
+from_timer(_handle, t, _timer);
	... when != _origarg
|
	... when != _origarg
	_handletype *_handle;
	... when != _handle
	_handle =
-(void *)_origarg;
+from_timer(_handle, t, _timer);
	... when != _origarg
)
 }

// callback(unsigned long arg) without existing variable
@change_callback_handle_cast_no_arg
 depends on change_timer_function_usage &&
                     !change_callback_handle_cast@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _origtype;
identifier _origarg;
type _handletype;
@@

 void _callback(
-_origtype _origarg
+struct timer_list *t
 )
 {
+	_handletype *_origarg = from_timer(_origarg, t, _timer);
+
	... when != _origarg
-	(_handletype *)_origarg
+	_origarg
	... when != _origarg
 }

// Avoid already converted callbacks.
@match_callback_converted
 depends on change_timer_function_usage &&
            !change_callback_handle_cast &&
	    !change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier t;
@@

 void _callback(struct timer_list *t)
 { ... }

// callback(struct something *handle)
@change_callback_handle_arg
 depends on change_timer_function_usage &&
	    !match_callback_converted &&
            !change_callback_handle_cast &&
            !change_callback_handle_cast_no_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
@@

 void _callback(
-_handletype *_handle
+struct timer_list *t
 )
 {
+	_handletype *_handle = from_timer(_handle, t, _timer);
	...
 }

// If change_callback_handle_arg ran on an empty function, remove
// the added handler.
@unchange_callback_handle_arg
 depends on change_timer_function_usage &&
	    change_callback_handle_arg@
identifier change_timer_function_usage._callback;
identifier change_timer_function_usage._timer;
type _handletype;
identifier _handle;
identifier t;
@@

 void _callback(struct timer_list *t)
 {
-	_handletype *_handle = from_timer(_handle, t, _timer);
 }

// We only want to refactor the setup_timer() data argument if we've found
// the matching callback. This undoes changes in change_timer_function_usage.
@unchange_timer_function_usage
 depends on change_timer_function_usage &&
            !change_callback_handle_cast &&
            !change_callback_handle_cast_no_arg &&
	    !change_callback_handle_arg@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type change_timer_function_usage._cast_data;
@@

(
-timer_setup(&_E->_timer, _callback, 0);
+setup_timer(&_E->_timer, _callback, (_cast_data)_E);
|
-timer_setup(&_E._timer, _callback, 0);
+setup_timer(&_E._timer, _callback, (_cast_data)&_E);
)

// If we fixed a callback from a .function assignment, fix the
// assignment cast now.
@change_timer_function_assignment
 depends on change_timer_function_usage &&
            (change_callback_handle_cast ||
             change_callback_handle_cast_no_arg ||
             change_callback_handle_arg)@
expression change_timer_function_usage._E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_func;
typedef TIMER_FUNC_TYPE;
@@

(
 _E->_timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E->_timer.function =
-&_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E->_timer.function =
-(_cast_func)_callback;
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E->_timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E._timer.function =
-_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E._timer.function =
-&_callback;
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E._timer.function =
-(_cast_func)_callback
+(TIMER_FUNC_TYPE)_callback
 ;
|
 _E._timer.function =
-(_cast_func)&_callback
+(TIMER_FUNC_TYPE)_callback
 ;
)

// Sometimes timer functions are called directly. Replace matched args.
@change_timer_function_calls
 depends on change_timer_function_usage &&
            (change_callback_handle_cast ||
             change_callback_handle_cast_no_arg ||
             change_callback_handle_arg)@
expression _E;
identifier change_timer_function_usage._timer;
identifier change_timer_function_usage._callback;
type _cast_data;
@@

 _callback(
(
-(_cast_data)_E
+&_E->_timer
|
-(_cast_data)&_E
+&_E._timer
|
-_E
+&_E->_timer
)
 )

// If a timer has been configured without a data argument, it can be
// converted without regard to the callback argument, since it is unused.
@match_timer_function_unused_data@
expression _E;
identifier _timer;
identifier _callback;
@@

(
-setup_timer(&_E->_timer, _callback, 0);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0L);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E->_timer, _callback, 0UL);
+timer_setup(&_E->_timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0L);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_E._timer, _callback, 0UL);
+timer_setup(&_E._timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0L);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(&_timer, _callback, 0UL);
+timer_setup(&_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0L);
+timer_setup(_timer, _callback, 0);
|
-setup_timer(_timer, _callback, 0UL);
+timer_setup(_timer, _callback, 0);
)

@change_callback_unused_data
 depends on match_timer_function_unused_data@
identifier match_timer_function_unused_data._callback;
type _origtype;
identifier _origarg;
@@

 void _callback(
-_origtype _origarg
+struct timer_list *unused
 )
 {
	... when != _origarg
 }

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 arch/alpha/kernel/srmcons.c                        |  7 ++---
 arch/arm/mach-iop32x/n2100.c                       |  4 +--
 arch/arm/mach-orion5x/db88f5281-setup.c            |  4 +--
 arch/blackfin/kernel/nmi.c                         |  4 +--
 arch/mips/lasat/picvue_proc.c                      |  4 +--
 arch/powerpc/kernel/tau_6xx.c                      |  4 +--
 arch/powerpc/oprofile/op_model_cell.c              |  8 +++---
 arch/powerpc/platforms/cell/spufs/sched.c          |  8 +++---
 arch/powerpc/platforms/powermac/low_i2c.c          |  6 ++--
 arch/s390/kernel/time.c                            |  4 +--
 arch/sh/drivers/heartbeat.c                        |  6 ++--
 arch/sh/drivers/pci/common.c                       | 14 ++++-----
 arch/sh/drivers/push-switch.c                      |  6 ++--
 block/blk-stat.c                                   |  6 ++--
 block/blk-throttle.c                               |  9 +++---
 drivers/atm/ambassador.c                           |  9 +++---
 drivers/atm/firestream.c                           |  6 ++--
 drivers/atm/horizon.c                              |  8 +++---
 drivers/atm/idt77252.c                             |  6 ++--
 drivers/atm/lanai.c                                |  6 ++--
 drivers/atm/nicstar.c                              |  6 ++--
 drivers/block/DAC960.c                             |  8 +++---
 drivers/block/DAC960.h                             |  2 +-
 drivers/block/rsxx/dma.c                           |  7 ++---
 drivers/block/skd_main.c                           |  6 ++--
 drivers/block/sunvdc.c                             |  9 +++---
 drivers/block/umem.c                               |  4 +--
 drivers/block/xsysace.c                            |  6 ++--
 drivers/char/ipmi/bt-bmc.c                         |  7 ++---
 drivers/char/ipmi/ipmi_msghandler.c                |  4 +--
 drivers/char/ipmi/ipmi_si_intf.c                   |  6 ++--
 drivers/char/ipmi/ipmi_ssif.c                      |  7 ++---
 drivers/char/tpm/tpm-dev-common.c                  |  7 ++---
 drivers/gpu/drm/drm_vblank.c                       | 11 ++++----
 drivers/gpu/drm/exynos/exynos_drm_vidi.c           |  6 ++--
 drivers/gpu/drm/i2c/tda998x_drv.c                  |  7 ++---
 drivers/gpu/drm/msm/adreno/a5xx_preempt.c          |  7 ++---
 drivers/gpu/drm/msm/msm_gpu.c                      |  7 ++---
 drivers/gpu/drm/omapdrm/dss/dsi.c                  |  4 +--
 drivers/gpu/drm/rockchip/rockchip_drm_psr.c        |  6 ++--
 drivers/gpu/drm/vgem/vgem_fence.c                  |  6 ++--
 drivers/gpu/drm/via/via_dmablit.c                  |  7 ++---
 drivers/hid/hid-appleir.c                          |  7 ++---
 drivers/hid/hid-prodikeys.c                        |  7 ++---
 drivers/hid/hid-wiimote-core.c                     |  6 ++--
 drivers/iio/common/ssp_sensors/ssp_dev.c           |  6 ++--
 drivers/infiniband/hw/mlx5/mr.c                    |  6 ++--
 drivers/input/gameport/gameport.c                  |  7 ++---
 drivers/input/joystick/db9.c                       |  6 ++--
 drivers/input/joystick/gamecon.c                   |  6 ++--
 drivers/input/joystick/turbografx.c                |  6 ++--
 drivers/iommu/iova.c                               |  8 +++---
 drivers/isdn/capi/capidrv.c                        |  6 ++--
 drivers/isdn/divert/isdn_divert.c                  |  9 +++---
 drivers/isdn/hardware/eicon/divasi.c               |  9 +++---
 drivers/isdn/hardware/mISDN/hfcmulti.c             |  8 ++----
 drivers/isdn/hardware/mISDN/hfcpci.c               |  5 ++--
 drivers/isdn/hardware/mISDN/mISDNisar.c            | 10 +++----
 drivers/isdn/i4l/isdn_common.c                     |  4 +--
 drivers/isdn/i4l/isdn_net.c                        |  9 +++---
 drivers/isdn/i4l/isdn_ppp.c                        |  9 +++---
 drivers/isdn/i4l/isdn_tty.c                        |  7 ++---
 drivers/media/platform/s5p-mfc/s5p_mfc.c           |  7 ++---
 .../media/platform/sti/c8sectpfe/c8sectpfe-core.c  |  7 ++---
 drivers/media/platform/vim2m.c                     |  6 ++--
 drivers/media/usb/au0828/au0828-dvb.c              |  7 ++---
 drivers/media/usb/au0828/au0828-video.c            | 14 ++++-----
 drivers/memstick/core/ms_block.c                   |  7 ++---
 drivers/mfd/rtsx_usb.c                             |  6 ++--
 drivers/mmc/core/host.c                            |  6 ++--
 drivers/mtd/sm_ftl.c                               |  6 ++--
 drivers/net/caif/caif_hsi.c                        | 21 ++++++--------
 drivers/net/dsa/mv88e6xxx/phy.c                    |  7 ++---
 drivers/net/eql.c                                  |  6 ++--
 drivers/net/ethernet/adi/bfin_mac.c                |  9 +++---
 drivers/net/ethernet/agere/et131x.c                |  7 ++---
 drivers/net/ethernet/amazon/ena/ena_netdev.c       |  7 ++---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c    | 14 ++++-----
 drivers/net/ethernet/atheros/atl1c/atl1c_main.c    |  8 +++---
 drivers/net/ethernet/atheros/atl1e/atl1e_main.c    |  8 +++---
 drivers/net/ethernet/atheros/atlx/atl1.c           |  8 +++---
 drivers/net/ethernet/atheros/atlx/atl2.c           | 15 +++++-----
 drivers/net/ethernet/broadcom/b44.c                |  6 ++--
 drivers/net/ethernet/broadcom/bnx2.c               |  6 ++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c   |  6 ++--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c          |  6 ++--
 drivers/net/ethernet/broadcom/tg3.c                |  6 ++--
 drivers/net/ethernet/cisco/enic/enic_main.c        |  7 ++---
 drivers/net/ethernet/marvell/mv643xx_eth.c         | 13 ++++-----
 drivers/net/ethernet/marvell/pxa168_eth.c          |  7 ++---
 drivers/net/ethernet/marvell/skge.c                |  6 ++--
 drivers/net/ethernet/marvell/sky2.c                |  6 ++--
 drivers/net/ethernet/myricom/myri10ge/myri10ge.c   |  7 ++---
 .../net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c   |  8 +++---
 drivers/net/ethernet/pasemi/pasemi_mac.c           |  7 ++---
 drivers/net/ethernet/qlogic/qla3xxx.c              |  6 ++--
 drivers/net/ethernet/rocker/rocker_ofdpa.c         |  7 ++---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 15 +++++-----
 drivers/net/ethernet/synopsys/dwc-xlgmac-net.c     |  7 ++---
 drivers/net/ethernet/ti/cpsw_ale.c                 |  6 ++--
 drivers/net/ethernet/ti/netcp_ethss.c              |  7 ++---
 drivers/net/ethernet/toshiba/spider_net.c          | 16 +++++------
 drivers/net/slip/slip.c                            | 16 +++++------
 drivers/net/tun.c                                  |  8 ++++--
 drivers/net/wan/hdlc_ppp.c                         |  6 ++--
 .../wireless/broadcom/brcm80211/brcmfmac/btcoex.c  |  6 ++--
 .../broadcom/brcm80211/brcmfmac/cfg80211.c         |  7 ++---
 .../wireless/broadcom/brcm80211/brcmfmac/sdio.c    |  7 ++---
 drivers/net/wireless/intel/iwlwifi/dvm/main.c      | 14 ++++-----
 drivers/net/wireless/intel/iwlwifi/pcie/tx.c       |  7 ++---
 drivers/net/wireless/intersil/hostap/hostap_ap.c   |  6 ++--
 drivers/net/wireless/intersil/hostap/hostap_hw.c   | 14 ++++-----
 .../net/wireless/intersil/orinoco/orinoco_usb.c    |  6 ++--
 drivers/net/wireless/quantenna/qtnfmac/core.c      |  2 +-
 drivers/net/wireless/ti/wlcore/main.c              |  7 ++---
 drivers/net/xen-netfront.c                         |  7 ++---
 drivers/nfc/pn533/pn533.c                          |  7 ++---
 drivers/nfc/st-nci/ndlc.c                          | 12 ++++----
 drivers/ntb/test/ntb_pingpong.c                    |  8 +++---
 drivers/platform/x86/sony-laptop.c                 |  4 +--
 drivers/pps/clients/pps-ktimer.c                   |  4 +--
 drivers/rtc/rtc-dev.c                              |  6 ++--
 drivers/s390/block/dasd.c                          | 17 ++++++-----
 drivers/s390/net/fsm.c                             |  9 +++---
 drivers/scsi/arcmsr/arcmsr_hba.c                   | 12 ++++----
 drivers/scsi/arm/fas216.c                          |  6 ++--
 drivers/scsi/bfa/bfad.c                            |  6 ++--
 drivers/scsi/bfa/bfad_drv.h                        |  2 +-
 drivers/scsi/bnx2fc/bnx2fc_tgt.c                   | 16 +++++------
 drivers/scsi/esas2r/esas2r_main.c                  |  8 +++---
 drivers/scsi/fcoe/fcoe_ctlr.c                      |  8 +++---
 drivers/scsi/fnic/fnic_main.c                      | 14 ++++-----
 drivers/scsi/ncr53c8xx.c                           |  6 ++--
 drivers/staging/greybus/operation.c                |  7 ++---
 drivers/staging/lustre/lnet/lnet/net_fault.c       |  6 ++--
 drivers/staging/lustre/lustre/ptlrpc/service.c     |  9 +++---
 drivers/staging/media/imx/imx-ic-prpencvf.c        |  7 ++---
 drivers/staging/media/imx/imx-media-csi.c          |  7 ++---
 drivers/staging/most/hdm-usb/hdm_usb.c             |  7 ++---
 .../staging/rtl8192u/ieee80211/ieee80211_softmac.c | 16 +++++------
 drivers/staging/rtl8712/recv_linux.c               |  9 +++---
 drivers/staging/rtl8712/rtl8712_led.c              |  9 +++---
 drivers/staging/unisys/visorbus/visorbus_main.c    |  6 ++--
 drivers/staging/unisys/visornic/visornic_main.c    |  8 +++---
 drivers/staging/wilc1000/wilc_wfi_cfgoperations.c  |  8 +++---
 drivers/target/target_core_user.c                  |  7 ++---
 drivers/tty/ipwireless/hardware.c                  | 11 ++++----
 drivers/tty/n_gsm.c                                | 12 ++++----
 drivers/tty/n_r3964.c                              |  8 +++---
 drivers/tty/serial/crisv10.c                       |  4 +--
 drivers/tty/serial/fsl_lpuart.c                    |  7 ++---
 drivers/tty/serial/ifx6x60.c                       |  7 ++---
 drivers/tty/serial/imx.c                           |  6 ++--
 drivers/tty/serial/kgdb_nmi.c                      |  6 ++--
 drivers/tty/serial/max3100.c                       |  7 ++---
 drivers/tty/serial/mux.c                           |  4 +--
 drivers/tty/serial/pnx8xxx_uart.c                  |  7 ++---
 drivers/tty/serial/sa1100.c                        |  7 ++---
 drivers/tty/serial/sh-sci.c                        | 16 +++++------
 drivers/tty/serial/sn_console.c                    |  6 ++--
 drivers/tty/synclink.c                             |  8 +++---
 drivers/tty/synclink_gt.c                          | 16 +++++------
 drivers/tty/synclinkmp.c                           | 17 ++++++-----
 drivers/usb/core/hcd.c                             |  8 ++++--
 drivers/usb/dwc2/hcd.c                             |  7 ++---
 drivers/usb/dwc2/hcd_queue.c                       |  7 ++---
 drivers/usb/gadget/udc/at91_udc.c                  |  7 ++---
 drivers/usb/gadget/udc/dummy_hcd.c                 |  8 +++---
 drivers/usb/gadget/udc/m66592-udc.c                |  6 ++--
 drivers/usb/gadget/udc/omap_udc.c                  |  6 ++--
 drivers/usb/gadget/udc/pxa25x_udc.c                |  6 ++--
 drivers/usb/gadget/udc/r8a66597-udc.c              |  6 ++--
 drivers/usb/host/ohci-hcd.c                        |  9 +++---
 drivers/usb/host/oxu210hp-hcd.c                    |  6 ++--
 drivers/usb/host/r8a66597-hcd.c                    |  7 ++---
 drivers/usb/host/sl811-hcd.c                       |  6 ++--
 drivers/usb/host/uhci-hcd.c                        |  3 +-
 drivers/usb/host/uhci-q.c                          |  4 +--
 drivers/usb/host/xhci.c                            |  8 +++---
 drivers/usb/serial/mos7840.c                       | 15 +++++-----
 drivers/usb/storage/realtek_cr.c                   |  7 ++---
 drivers/uwb/drp.c                                  |  6 ++--
 drivers/uwb/neh.c                                  |  8 +++---
 drivers/uwb/rsv.c                                  | 15 +++++-----
 drivers/uwb/uwb-internal.h                         |  2 +-
 drivers/watchdog/at91sam9_wdt.c                    |  6 ++--
 drivers/watchdog/bcm47xx_wdt.c                     |  9 +++---
 drivers/watchdog/bcm63xx_wdt.c                     |  4 +--
 drivers/watchdog/cpu5wdt.c                         |  4 +--
 drivers/watchdog/mpc8xxx_wdt.c                     |  7 ++---
 drivers/watchdog/mtx-1_wdt.c                       |  4 +--
 drivers/watchdog/nuc900_wdt.c                      |  4 +--
 drivers/watchdog/pcwd.c                            |  4 +--
 drivers/watchdog/pika_wdt.c                        |  4 +--
 drivers/watchdog/rdc321x_wdt.c                     |  4 +--
 drivers/watchdog/shwdt.c                           |  6 ++--
 fs/ocfs2/cluster/tcp.c                             |  9 +++---
 kernel/padata.c                                    |  6 ++--
 kernel/time/clocksource.c                          |  4 +--
 net/802/garp.c                                     |  6 ++--
 net/802/mrp.c                                      | 13 ++++-----
 net/appletalk/aarp.c                               |  4 +--
 net/appletalk/ddp.c                                |  7 ++---
 net/batman-adv/tp_meter.c                          | 14 ++++-----
 net/bluetooth/hidp/core.c                          |  7 ++---
 net/bluetooth/rfcomm/core.c                        | 12 ++++----
 net/bluetooth/sco.c                                |  6 ++--
 net/core/drop_monitor.c                            |  7 ++---
 net/core/gen_estimator.c                           |  6 ++--
 net/core/neighbour.c                               | 14 ++++-----
 net/decnet/dn_route.c                              |  4 +--
 net/decnet/dn_timer.c                              |  8 +++---
 net/ipv4/igmp.c                                    | 20 ++++++-------
 net/ipv4/ipmr.c                                    |  9 +++---
 net/ipv6/addrconf.c                                |  9 +++---
 net/ipv6/ip6mr.c                                   |  9 +++---
 net/ipv6/mcast.c                                   | 33 ++++++++++------------
 net/ncsi/ncsi-manage.c                             |  8 ++----
 net/netfilter/nf_conntrack_expect.c                |  7 ++---
 net/netfilter/nfnetlink_log.c                      |  8 +++---
 net/netfilter/xt_IDLETIMER.c                       |  7 ++---
 net/netfilter/xt_LED.c                             |  8 +++---
 net/nfc/nci/core.c                                 | 14 ++++-----
 net/rxrpc/call_object.c                            |  7 ++---
 net/wireless/lib80211.c                            | 11 ++++----
 net/x25/x25_link.c                                 |  8 +++---
 net/xfrm/xfrm_state.c                              |  9 +++---
 227 files changed, 824 insertions(+), 937 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c
index 5da0aec8ce90..438b10c44d73 100644
--- a/arch/alpha/kernel/srmcons.c
+++ b/arch/alpha/kernel/srmcons.c
@@ -65,9 +65,9 @@ srmcons_do_receive_chars(struct tty_port *port)
 }
 
 static void
-srmcons_receive_chars(unsigned long data)
+srmcons_receive_chars(struct timer_list *t)
 {
-	struct srmcons_private *srmconsp = (struct srmcons_private *)data;
+	struct srmcons_private *srmconsp = from_timer(srmconsp, t, timer);
 	struct tty_port *port = &srmconsp->port;
 	unsigned long flags;
 	int incr = 10;
@@ -206,8 +206,7 @@ static const struct tty_operations srmcons_ops = {
 static int __init
 srmcons_init(void)
 {
-	setup_timer(&srmcons_singleton.timer, srmcons_receive_chars,
-			(unsigned long)&srmcons_singleton);
+	timer_setup(&srmcons_singleton.timer, srmcons_receive_chars, 0);
 	if (srm_is_registered_console) {
 		struct tty_driver *driver;
 		int err;
diff --git a/arch/arm/mach-iop32x/n2100.c b/arch/arm/mach-iop32x/n2100.c
index 4a64a11ba63c..3b73813c6b04 100644
--- a/arch/arm/mach-iop32x/n2100.c
+++ b/arch/arm/mach-iop32x/n2100.c
@@ -305,7 +305,7 @@ static void n2100_restart(enum reboot_mode mode, const char *cmd)
 
 static struct timer_list power_button_poll_timer;
 
-static void power_button_poll(unsigned long dummy)
+static void power_button_poll(struct timer_list *unused)
 {
 	if (gpio_get_value(N2100_POWER_BUTTON) == 0) {
 		ctrl_alt_del();
@@ -336,7 +336,7 @@ static int __init n2100_request_gpios(void)
 			pr_err("could not set power GPIO as input\n");
 	}
 	/* Set up power button poll timer */
-	setup_timer(&power_button_poll_timer, power_button_poll, 0UL);
+	timer_setup(&power_button_poll_timer, power_button_poll, 0);
 	power_button_poll_timer.expires = jiffies + (HZ / 10);
 	add_timer(&power_button_poll_timer);
 	return 0;
diff --git a/arch/arm/mach-orion5x/db88f5281-setup.c b/arch/arm/mach-orion5x/db88f5281-setup.c
index 3f5863de766a..39eae10ac8de 100644
--- a/arch/arm/mach-orion5x/db88f5281-setup.c
+++ b/arch/arm/mach-orion5x/db88f5281-setup.c
@@ -172,7 +172,7 @@ static struct platform_device db88f5281_nand_flash = {
 static void __iomem *db88f5281_7seg;
 static struct timer_list db88f5281_timer;
 
-static void db88f5281_7seg_event(unsigned long data)
+static void db88f5281_7seg_event(struct timer_list *unused)
 {
 	static int count = 0;
 	writel(0, db88f5281_7seg + (count << 4));
@@ -189,7 +189,7 @@ static int __init db88f5281_7seg_init(void)
 			printk(KERN_ERR "Failed to ioremap db88f5281_7seg\n");
 			return -EIO;
 		}
-		setup_timer(&db88f5281_timer, db88f5281_7seg_event, 0);
+		timer_setup(&db88f5281_timer, db88f5281_7seg_event, 0);
 		mod_timer(&db88f5281_timer, jiffies + 2 * HZ);
 	}
 
diff --git a/arch/blackfin/kernel/nmi.c b/arch/blackfin/kernel/nmi.c
index 828f4fbdb58a..8a211d95821f 100644
--- a/arch/blackfin/kernel/nmi.c
+++ b/arch/blackfin/kernel/nmi.c
@@ -166,7 +166,7 @@ int check_nmi_wdt_touched(void)
 	return 1;
 }
 
-static void nmi_wdt_timer(unsigned long data)
+static void nmi_wdt_timer(struct timer_list *unused)
 {
 	if (check_nmi_wdt_touched())
 		nmi_wdt_keepalive();
@@ -180,7 +180,7 @@ static int __init init_nmi_wdt(void)
 	nmi_wdt_start();
 	nmi_active = true;
 
-	setup_timer(&ntimer, nmi_wdt_timer, 0UL);
+	timer_setup(&ntimer, nmi_wdt_timer, 0);
 	ntimer.expires = jiffies + NMI_CHECK_TIMEOUT;
 	add_timer(&ntimer);
 
diff --git a/arch/mips/lasat/picvue_proc.c b/arch/mips/lasat/picvue_proc.c
index a8103f6972cd..5d89e1ec5fcc 100644
--- a/arch/mips/lasat/picvue_proc.c
+++ b/arch/mips/lasat/picvue_proc.c
@@ -156,7 +156,7 @@ static const struct file_operations pvc_scroll_proc_fops = {
 	.write		= pvc_scroll_proc_write,
 };
 
-void pvc_proc_timerfunc(unsigned long data)
+void pvc_proc_timerfunc(struct timer_list *unused)
 {
 	if (scroll_dir < 0)
 		pvc_move(DISPLAY|RIGHT);
@@ -197,7 +197,7 @@ static int __init pvc_proc_init(void)
 	if (proc_entry == NULL)
 		goto error;
 
-	setup_timer(&timer, pvc_proc_timerfunc, 0UL);
+	timer_setup(&timer, pvc_proc_timerfunc, 0);
 
 	return 0;
 error:
diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c
index e3c5f75d137c..8cdd852aedd1 100644
--- a/arch/powerpc/kernel/tau_6xx.c
+++ b/arch/powerpc/kernel/tau_6xx.c
@@ -188,7 +188,7 @@ static void tau_timeout(void * info)
 	local_irq_restore(flags);
 }
 
-static void tau_timeout_smp(unsigned long unused)
+static void tau_timeout_smp(struct timer_list *unused)
 {
 
 	/* schedule ourselves to be run again */
@@ -230,7 +230,7 @@ int __init TAU_init(void)
 
 
 	/* first, set up the window shrinking timer */
-	setup_timer(&tau_timer, tau_timeout_smp, 0UL);
+	timer_setup(&tau_timer, tau_timeout_smp, 0);
 	tau_timer.expires = jiffies + shrink_timer;
 	add_timer(&tau_timer);
 
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index 264b6ab11978..b90a21bc2f3f 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -451,7 +451,7 @@ static inline void enable_ctr(u32 cpu, u32 ctr, u32 *pm07_cntrl)
  * This routine will alternate loading the virtual counters for
  * virtual CPUs
  */
-static void cell_virtual_cntr(unsigned long data)
+static void cell_virtual_cntr(struct timer_list *unused)
 {
 	int i, prev_hdw_thread, next_hdw_thread;
 	u32 cpu;
@@ -555,7 +555,7 @@ static void cell_virtual_cntr(unsigned long data)
 
 static void start_virt_cntrs(void)
 {
-	setup_timer(&timer_virt_cntr, cell_virtual_cntr, 0UL);
+	timer_setup(&timer_virt_cntr, cell_virtual_cntr, 0);
 	timer_virt_cntr.expires = jiffies + HZ / 10;
 	add_timer(&timer_virt_cntr);
 }
@@ -587,7 +587,7 @@ static int cell_reg_setup_spu_cycles(struct op_counter_config *ctr,
  * periodically based on kernel timer to switch which SPU is
  * being monitored in a round robbin fashion.
  */
-static void spu_evnt_swap(unsigned long data)
+static void spu_evnt_swap(struct timer_list *unused)
 {
 	int node;
 	int cur_phys_spu, nxt_phys_spu, cur_spu_evnt_phys_spu_indx;
@@ -677,7 +677,7 @@ static void spu_evnt_swap(unsigned long data)
 
 static void start_spu_event_swap(void)
 {
-	setup_timer(&timer_spu_event_swap, spu_evnt_swap, 0UL);
+	timer_setup(&timer_spu_event_swap, spu_evnt_swap, 0);
 	timer_spu_event_swap.expires = jiffies + HZ / 25;
 	add_timer(&timer_spu_event_swap);
 }
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index e47761cdcb98..9033c8194eda 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -992,13 +992,13 @@ static void spu_calc_load(void)
 	CALC_LOAD(spu_avenrun[2], EXP_15, active_tasks);
 }
 
-static void spusched_wake(unsigned long data)
+static void spusched_wake(struct timer_list *unused)
 {
 	mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 	wake_up_process(spusched_task);
 }
 
-static void spuloadavg_wake(unsigned long data)
+static void spuloadavg_wake(struct timer_list *unused)
 {
 	mod_timer(&spuloadavg_timer, jiffies + LOAD_FREQ);
 	spu_calc_load();
@@ -1124,8 +1124,8 @@ int __init spu_sched_init(void)
 	}
 	spin_lock_init(&spu_prio->runq_lock);
 
-	setup_timer(&spusched_timer, spusched_wake, 0);
-	setup_timer(&spuloadavg_timer, spuloadavg_wake, 0);
+	timer_setup(&spusched_timer, spusched_wake, 0);
+	timer_setup(&spuloadavg_timer, spuloadavg_wake, 0);
 
 	spusched_task = kthread_run(spusched_thread, NULL, "spusched");
 	if (IS_ERR(spusched_task)) {
diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
index 39a1d4225e0f..3408f315ef48 100644
--- a/arch/powerpc/platforms/powermac/low_i2c.c
+++ b/arch/powerpc/platforms/powermac/low_i2c.c
@@ -361,9 +361,9 @@ static irqreturn_t kw_i2c_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static void kw_i2c_timeout(unsigned long data)
+static void kw_i2c_timeout(struct timer_list *t)
 {
-	struct pmac_i2c_host_kw *host = (struct pmac_i2c_host_kw *)data;
+	struct pmac_i2c_host_kw *host = from_timer(host, t, timeout_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&host->lock, flags);
@@ -513,7 +513,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np)
 	mutex_init(&host->mutex);
 	init_completion(&host->complete);
 	spin_lock_init(&host->lock);
-	setup_timer(&host->timeout_timer, kw_i2c_timeout, (unsigned long)host);
+	timer_setup(&host->timeout_timer, kw_i2c_timeout, 0);
 
 	psteps = of_get_property(np, "AAPL,address-step", NULL);
 	steps = psteps ? (*psteps) : 0x10;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 5cbd52169348..be6198193ec2 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -523,7 +523,7 @@ static void __init stp_reset(void)
 	}
 }
 
-static void stp_timeout(unsigned long dummy)
+static void stp_timeout(struct timer_list *unused)
 {
 	queue_work(time_sync_wq, &stp_work);
 }
@@ -532,7 +532,7 @@ static int __init stp_init(void)
 {
 	if (!test_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags))
 		return 0;
-	setup_timer(&stp_timer, stp_timeout, 0UL);
+	timer_setup(&stp_timer, stp_timeout, 0);
 	time_init_wq();
 	if (!stp_online)
 		return 0;
diff --git a/arch/sh/drivers/heartbeat.c b/arch/sh/drivers/heartbeat.c
index c6d96049a0bb..e8af2ff29bc3 100644
--- a/arch/sh/drivers/heartbeat.c
+++ b/arch/sh/drivers/heartbeat.c
@@ -59,9 +59,9 @@ static inline void heartbeat_toggle_bit(struct heartbeat_data *hd,
 	}
 }
 
-static void heartbeat_timer(unsigned long data)
+static void heartbeat_timer(struct timer_list *t)
 {
-	struct heartbeat_data *hd = (struct heartbeat_data *)data;
+	struct heartbeat_data *hd = from_timer(hd, t, timer);
 	static unsigned bit = 0, up = 1;
 
 	heartbeat_toggle_bit(hd, bit, hd->flags & HEARTBEAT_INVERTED);
@@ -133,7 +133,7 @@ static int heartbeat_drv_probe(struct platform_device *pdev)
 		}
 	}
 
-	setup_timer(&hd->timer, heartbeat_timer, (unsigned long)hd);
+	timer_setup(&hd->timer, heartbeat_timer, 0);
 	platform_set_drvdata(pdev, hd);
 
 	return mod_timer(&hd->timer, jiffies + 1);
diff --git a/arch/sh/drivers/pci/common.c b/arch/sh/drivers/pci/common.c
index 0d7eb7b5ac8d..fe163ecd0719 100644
--- a/arch/sh/drivers/pci/common.c
+++ b/arch/sh/drivers/pci/common.c
@@ -85,18 +85,18 @@ int __init pci_is_66mhz_capable(struct pci_channel *hose,
 	return cap66 > 0;
 }
 
-static void pcibios_enable_err(unsigned long __data)
+static void pcibios_enable_err(struct timer_list *t)
 {
-	struct pci_channel *hose = (struct pci_channel *)__data;
+	struct pci_channel *hose = from_timer(hose, t, err_timer);
 
 	del_timer(&hose->err_timer);
 	printk(KERN_DEBUG "PCI: re-enabling error IRQ.\n");
 	enable_irq(hose->err_irq);
 }
 
-static void pcibios_enable_serr(unsigned long __data)
+static void pcibios_enable_serr(struct timer_list *t)
 {
-	struct pci_channel *hose = (struct pci_channel *)__data;
+	struct pci_channel *hose = from_timer(hose, t, serr_timer);
 
 	del_timer(&hose->serr_timer);
 	printk(KERN_DEBUG "PCI: re-enabling system error IRQ.\n");
@@ -106,13 +106,11 @@ static void pcibios_enable_serr(unsigned long __data)
 void pcibios_enable_timers(struct pci_channel *hose)
 {
 	if (hose->err_irq) {
-		setup_timer(&hose->err_timer, pcibios_enable_err,
-			    (unsigned long)hose);
+		timer_setup(&hose->err_timer, pcibios_enable_err, 0);
 	}
 
 	if (hose->serr_irq) {
-		setup_timer(&hose->serr_timer, pcibios_enable_serr,
-			    (unsigned long)hose);
+		timer_setup(&hose->serr_timer, pcibios_enable_serr, 0);
 	}
 }
 
diff --git a/arch/sh/drivers/push-switch.c b/arch/sh/drivers/push-switch.c
index 2dc791507968..a17181160233 100644
--- a/arch/sh/drivers/push-switch.c
+++ b/arch/sh/drivers/push-switch.c
@@ -26,9 +26,9 @@ static ssize_t switch_show(struct device *dev,
 }
 static DEVICE_ATTR(switch, S_IRUGO, switch_show, NULL);
 
-static void switch_timer(unsigned long data)
+static void switch_timer(struct timer_list *t)
 {
-	struct push_switch *psw = (struct push_switch *)data;
+	struct push_switch *psw = from_timer(psw, t, debounce);
 
 	schedule_work(&psw->work);
 }
@@ -78,7 +78,7 @@ static int switch_drv_probe(struct platform_device *pdev)
 	}
 
 	INIT_WORK(&psw->work, switch_work_handler);
-	setup_timer(&psw->debounce, switch_timer, (unsigned long)psw);
+	timer_setup(&psw->debounce, switch_timer, 0);
 
 	/* Workqueue API brain-damage */
 	psw->pdev = pdev;
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 3a2f3c96f367..28003bf9941c 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -79,9 +79,9 @@ void blk_stat_add(struct request *rq)
 	rcu_read_unlock();
 }
 
-static void blk_stat_timer_fn(unsigned long data)
+static void blk_stat_timer_fn(struct timer_list *t)
 {
-	struct blk_stat_callback *cb = (void *)data;
+	struct blk_stat_callback *cb = from_timer(cb, t, timer);
 	unsigned int bucket;
 	int cpu;
 
@@ -130,7 +130,7 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
 	cb->bucket_fn = bucket_fn;
 	cb->data = data;
 	cb->buckets = buckets;
-	setup_timer(&cb->timer, blk_stat_timer_fn, (unsigned long)cb);
+	timer_setup(&cb->timer, blk_stat_timer_fn, 0);
 
 	return cb;
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 96ad32623427..825bc29767e6 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -225,7 +225,7 @@ struct throtl_data
 	bool track_bio_latency;
 };
 
-static void throtl_pending_timer_fn(unsigned long arg);
+static void throtl_pending_timer_fn(struct timer_list *t);
 
 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
 {
@@ -478,8 +478,7 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
 	INIT_LIST_HEAD(&sq->queued[0]);
 	INIT_LIST_HEAD(&sq->queued[1]);
 	sq->pending_tree = RB_ROOT;
-	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
-		    (unsigned long)sq);
+	timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
 }
 
 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
@@ -1249,9 +1248,9 @@ static bool throtl_can_upgrade(struct throtl_data *td,
  * the top-level service_tree is reached, throtl_data->dispatch_work is
  * kicked so that the ready bio's are issued.
  */
-static void throtl_pending_timer_fn(unsigned long arg)
+static void throtl_pending_timer_fn(struct timer_list *t)
 {
-	struct throtl_service_queue *sq = (void *)arg;
+	struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
 	struct throtl_grp *tg = sq_to_tg(sq);
 	struct throtl_data *td = sq_to_td(sq);
 	struct request_queue *q = td->queue;
diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c
index acf16c323e38..dd286ad404f8 100644
--- a/drivers/atm/ambassador.c
+++ b/drivers/atm/ambassador.c
@@ -293,7 +293,7 @@ static inline void __init show_version (void) {
   
 */
 
-static void do_housekeeping (unsigned long arg);
+static void do_housekeeping (struct timer_list *t);
 /********** globals **********/
 
 static unsigned short debug = 0;
@@ -1493,8 +1493,8 @@ static const struct atmdev_ops amb_ops = {
 };
 
 /********** housekeeping **********/
-static void do_housekeeping (unsigned long arg) {
-  amb_dev * dev = (amb_dev *) arg;
+static void do_housekeeping (struct timer_list *t) {
+  amb_dev * dev = from_timer(dev, t, housekeeping);
   
   // could collect device-specific (not driver/atm-linux) stats here
       
@@ -2267,8 +2267,7 @@ static int amb_probe(struct pci_dev *pci_dev,
 	dev->atm_dev->ci_range.vpi_bits = NUM_VPI_BITS;
 	dev->atm_dev->ci_range.vci_bits = NUM_VCI_BITS;
 
-	setup_timer(&dev->housekeeping, do_housekeeping,
-		    (unsigned long)dev);
+	timer_setup(&dev->housekeeping, do_housekeeping, 0);
 	mod_timer(&dev->housekeeping, jiffies);
 
 	// enable host interrupts
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index 534001270be5..d97c05690faa 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -1656,9 +1656,9 @@ static irqreturn_t fs_irq (int irq, void *dev_id)
 
 
 #ifdef FS_POLL_FREQ
-static void fs_poll (unsigned long data)
+static void fs_poll (struct timer_list *t)
 {
-	struct fs_dev *dev = (struct fs_dev *) data;
+	struct fs_dev *dev = from_timer(dev, t, timer);
   
 	fs_irq (0, dev);
 	dev->timer.expires = jiffies + FS_POLL_FREQ;
@@ -1885,7 +1885,7 @@ static int fs_init(struct fs_dev *dev)
 	}
 
 #ifdef FS_POLL_FREQ
-	setup_timer (&dev->timer, fs_poll, (unsigned long)dev);
+	timer_setup(&dev->timer, fs_poll, 0);
 	dev->timer.expires = jiffies + FS_POLL_FREQ;
 	add_timer (&dev->timer);
 #endif
diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c
index e121b8485731..5ddc203206b8 100644
--- a/drivers/atm/horizon.c
+++ b/drivers/atm/horizon.c
@@ -357,7 +357,7 @@ static inline void __init show_version (void) {
 
 /********** globals **********/
 
-static void do_housekeeping (unsigned long arg);
+static void do_housekeeping (struct timer_list *t);
 
 static unsigned short debug = 0;
 static unsigned short vpi_bits = 0;
@@ -1418,9 +1418,9 @@ static irqreturn_t interrupt_handler(int irq, void *dev_id)
 
 /********** housekeeping **********/
 
-static void do_housekeeping (unsigned long arg) {
+static void do_housekeeping (struct timer_list *t) {
   // just stats at the moment
-  hrz_dev * dev = (hrz_dev *) arg;
+  hrz_dev * dev = from_timer(dev, t, housekeeping);
 
   // collect device-specific (not driver/atm-linux) stats here
   dev->tx_cell_count += rd_regw (dev, TX_CELL_COUNT_OFF);
@@ -2796,7 +2796,7 @@ static int hrz_probe(struct pci_dev *pci_dev,
 	dev->atm_dev->ci_range.vpi_bits = vpi_bits;
 	dev->atm_dev->ci_range.vci_bits = 10-vpi_bits;
 
-	setup_timer(&dev->housekeeping, do_housekeeping, (unsigned long) dev);
+	timer_setup(&dev->housekeeping, do_housekeeping, 0);
 	mod_timer(&dev->housekeeping, jiffies);
 
 out:
diff --git a/drivers/atm/idt77252.c b/drivers/atm/idt77252.c
index 0e3b9c44c808..0277f36be85b 100644
--- a/drivers/atm/idt77252.c
+++ b/drivers/atm/idt77252.c
@@ -1528,9 +1528,9 @@ idt77252_tx(struct idt77252_dev *card)
 
 
 static void
-tst_timer(unsigned long data)
+tst_timer(struct timer_list *t)
 {
-	struct idt77252_dev *card = (struct idt77252_dev *)data;
+	struct idt77252_dev *card = from_timer(card, t, tst_timer);
 	unsigned long base, idle, jump;
 	unsigned long flags;
 	u32 pc;
@@ -3634,7 +3634,7 @@ static int idt77252_init_one(struct pci_dev *pcidev,
 	spin_lock_init(&card->cmd_lock);
 	spin_lock_init(&card->tst_lock);
 
-	setup_timer(&card->tst_timer, tst_timer, (unsigned long)card);
+	timer_setup(&card->tst_timer, tst_timer, 0);
 
 	/* Do the I/O remapping... */
 	card->membase = ioremap(membase, 1024);
diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index 87e8b5dfac39..6664aa50789e 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -1761,9 +1761,9 @@ static void iter_dequeue(struct lanai_dev *lanai, vci_t vci)
 }
 #endif /* !DEBUG_RW */
 
-static void lanai_timed_poll(unsigned long arg)
+static void lanai_timed_poll(struct timer_list *t)
 {
-	struct lanai_dev *lanai = (struct lanai_dev *) arg;
+	struct lanai_dev *lanai = from_timer(lanai, t, timer);
 #ifndef DEBUG_RW
 	unsigned long flags;
 #ifdef USE_POWERDOWN
@@ -1790,7 +1790,7 @@ static void lanai_timed_poll(unsigned long arg)
 
 static inline void lanai_timed_poll_start(struct lanai_dev *lanai)
 {
-	setup_timer(&lanai->timer, lanai_timed_poll, (unsigned long)lanai);
+	timer_setup(&lanai->timer, lanai_timed_poll, 0);
 	lanai->timer.expires = jiffies + LANAI_POLL_PERIOD;
 	add_timer(&lanai->timer);
 }
diff --git a/drivers/atm/nicstar.c b/drivers/atm/nicstar.c
index 335447ed0ba4..cbec9adc01c7 100644
--- a/drivers/atm/nicstar.c
+++ b/drivers/atm/nicstar.c
@@ -145,7 +145,7 @@ static int ns_ioctl(struct atm_dev *dev, unsigned int cmd, void __user * arg);
 #ifdef EXTRA_DEBUG
 static void which_list(ns_dev * card, struct sk_buff *skb);
 #endif
-static void ns_poll(unsigned long arg);
+static void ns_poll(struct timer_list *unused);
 static void ns_phy_put(struct atm_dev *dev, unsigned char value,
 		       unsigned long addr);
 static unsigned char ns_phy_get(struct atm_dev *dev, unsigned long addr);
@@ -284,7 +284,7 @@ static int __init nicstar_init(void)
 	XPRINTK("nicstar: nicstar_init() returned.\n");
 
 	if (!error) {
-		setup_timer(&ns_timer, ns_poll, 0UL);
+		timer_setup(&ns_timer, ns_poll, 0);
 		ns_timer.expires = jiffies + NS_POLL_PERIOD;
 		add_timer(&ns_timer);
 	}
@@ -2679,7 +2679,7 @@ static void which_list(ns_dev * card, struct sk_buff *skb)
 }
 #endif /* EXTRA_DEBUG */
 
-static void ns_poll(unsigned long arg)
+static void ns_poll(struct timer_list *unused)
 {
 	int i;
 	ns_dev *card;
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 6f14cdd6015b..442e777bdfb2 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3079,8 +3079,8 @@ DAC960_InitializeController(DAC960_Controller_T *Controller)
       /*
 	Initialize the Monitoring Timer.
       */
-      setup_timer(&Controller->MonitoringTimer,
-                  DAC960_MonitoringTimerFunction, (unsigned long)Controller);
+      timer_setup(&Controller->MonitoringTimer,
+                  DAC960_MonitoringTimerFunction, 0);
       Controller->MonitoringTimer.expires =
 	jiffies + DAC960_MonitoringTimerInterval;
       add_timer(&Controller->MonitoringTimer);
@@ -5619,9 +5619,9 @@ static void DAC960_V2_QueueMonitoringCommand(DAC960_Command_T *Command)
   the status of DAC960 Controllers.
 */
 
-static void DAC960_MonitoringTimerFunction(unsigned long TimerData)
+static void DAC960_MonitoringTimerFunction(struct timer_list *t)
 {
-  DAC960_Controller_T *Controller = (DAC960_Controller_T *) TimerData;
+  DAC960_Controller_T *Controller = from_timer(Controller, t, MonitoringTimer);
   DAC960_Command_T *Command;
   unsigned long flags;
 
diff --git a/drivers/block/DAC960.h b/drivers/block/DAC960.h
index 85fa9bb63759..6a6226a2b932 100644
--- a/drivers/block/DAC960.h
+++ b/drivers/block/DAC960.h
@@ -4406,7 +4406,7 @@ static irqreturn_t DAC960_PD_InterruptHandler(int, void *);
 static irqreturn_t DAC960_P_InterruptHandler(int, void *);
 static void DAC960_V1_QueueMonitoringCommand(DAC960_Command_T *);
 static void DAC960_V2_QueueMonitoringCommand(DAC960_Command_T *);
-static void DAC960_MonitoringTimerFunction(unsigned long);
+static void DAC960_MonitoringTimerFunction(struct timer_list *);
 static void DAC960_Message(DAC960_MessageLevel_T, unsigned char *,
 			   DAC960_Controller_T *, ...);
 static void DAC960_CreateProcEntries(DAC960_Controller_T *);
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 6a1b2177951c..beaccf197a5a 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -354,9 +354,9 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
 		rsxx_complete_dma(ctrl, dma, status);
 }
 
-static void dma_engine_stalled(unsigned long data)
+static void dma_engine_stalled(struct timer_list *t)
 {
-	struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
+	struct rsxx_dma_ctrl *ctrl = from_timer(ctrl, t, activity_timer);
 	int cnt;
 
 	if (atomic_read(&ctrl->stats.hw_q_depth) == 0 ||
@@ -838,8 +838,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
 	mutex_init(&ctrl->work_lock);
 	INIT_LIST_HEAD(&ctrl->queue);
 
-	setup_timer(&ctrl->activity_timer, dma_engine_stalled,
-					(unsigned long)ctrl);
+	timer_setup(&ctrl->activity_timer, dma_engine_stalled, 0);
 
 	ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0);
 	if (!ctrl->issue_wq)
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 2819f23e8bf2..de0d08133c7e 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -707,9 +707,9 @@ static void skd_start_queue(struct work_struct *work)
 	blk_mq_start_hw_queues(skdev->queue);
 }
 
-static void skd_timer_tick(ulong arg)
+static void skd_timer_tick(struct timer_list *t)
 {
-	struct skd_device *skdev = (struct skd_device *)arg;
+	struct skd_device *skdev = from_timer(skdev, t, timer);
 	unsigned long reqflags;
 	u32 state;
 
@@ -857,7 +857,7 @@ static int skd_start_timer(struct skd_device *skdev)
 {
 	int rc;
 
-	setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev);
+	timer_setup(&skdev->timer, skd_timer_tick, 0);
 
 	rc = mod_timer(&skdev->timer, (jiffies + HZ));
 	if (rc)
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index ad9749463d4f..5ca56bfae63c 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -81,7 +81,7 @@ struct vdc_port {
 
 static void vdc_ldc_reset(struct vdc_port *port);
 static void vdc_ldc_reset_work(struct work_struct *work);
-static void vdc_ldc_reset_timer(unsigned long _arg);
+static void vdc_ldc_reset_timer(struct timer_list *t);
 
 static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
 {
@@ -974,8 +974,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	 */
 	ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL);
 	port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0;
-	setup_timer(&port->ldc_reset_timer, vdc_ldc_reset_timer,
-		    (unsigned long)port);
+	timer_setup(&port->ldc_reset_timer, vdc_ldc_reset_timer, 0);
 	INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work);
 
 	err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
@@ -1087,9 +1086,9 @@ static void vdc_queue_drain(struct vdc_port *port)
 		__blk_end_request_all(req, BLK_STS_IOERR);
 }
 
-static void vdc_ldc_reset_timer(unsigned long _arg)
+static void vdc_ldc_reset_timer(struct timer_list *t)
 {
-	struct vdc_port *port = (struct vdc_port *) _arg;
+	struct vdc_port *port = from_timer(port, t, ldc_reset_timer);
 	struct vio_driver_state *vio = &port->vio;
 	unsigned long flags;
 
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index b4d4ccfe7582..8077123678ad 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -718,7 +718,7 @@ static void check_batteries(struct cardinfo *card)
 		set_fault_to_battery_status(card);
 }
 
-static void check_all_batteries(unsigned long ptr)
+static void check_all_batteries(struct timer_list *unused)
 {
 	int i;
 
@@ -738,7 +738,7 @@ static void check_all_batteries(unsigned long ptr)
 
 static void init_battery_timer(void)
 {
-	setup_timer(&battery_timer, check_all_batteries, 0UL);
+	timer_setup(&battery_timer, check_all_batteries, 0);
 	battery_timer.expires = jiffies + (HZ * 60);
 	add_timer(&battery_timer);
 }
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 14459d66ef0c..c24589414c75 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -770,9 +770,9 @@ static void ace_fsm_tasklet(unsigned long data)
 	spin_unlock_irqrestore(&ace->lock, flags);
 }
 
-static void ace_stall_timer(unsigned long data)
+static void ace_stall_timer(struct timer_list *t)
 {
-	struct ace_device *ace = (void *)data;
+	struct ace_device *ace = from_timer(ace, t, stall_timer);
 	unsigned long flags;
 
 	dev_warn(ace->dev,
@@ -984,7 +984,7 @@ static int ace_setup(struct ace_device *ace)
 	 * Initialize the state machine tasklet and stall timer
 	 */
 	tasklet_init(&ace->fsm_tasklet, ace_fsm_tasklet, (unsigned long)ace);
-	setup_timer(&ace->stall_timer, ace_stall_timer, (unsigned long)ace);
+	timer_setup(&ace->stall_timer, ace_stall_timer, 0);
 
 	/*
 	 * Initialize the request queue
diff --git a/drivers/char/ipmi/bt-bmc.c b/drivers/char/ipmi/bt-bmc.c
index c4ef73c6f455..6edfaa72b98b 100644
--- a/drivers/char/ipmi/bt-bmc.c
+++ b/drivers/char/ipmi/bt-bmc.c
@@ -367,9 +367,9 @@ static const struct file_operations bt_bmc_fops = {
 	.unlocked_ioctl	= bt_bmc_ioctl,
 };
 
-static void poll_timer(unsigned long data)
+static void poll_timer(struct timer_list *t)
 {
-	struct bt_bmc *bt_bmc = (void *)data;
+	struct bt_bmc *bt_bmc = from_timer(bt_bmc, t, poll_timer);
 
 	bt_bmc->poll_timer.expires += msecs_to_jiffies(500);
 	wake_up(&bt_bmc->queue);
@@ -487,8 +487,7 @@ static int bt_bmc_probe(struct platform_device *pdev)
 		dev_info(dev, "Using IRQ %d\n", bt_bmc->irq);
 	} else {
 		dev_info(dev, "No IRQ; using timer\n");
-		setup_timer(&bt_bmc->poll_timer, poll_timer,
-			    (unsigned long)bt_bmc);
+		timer_setup(&bt_bmc->poll_timer, poll_timer, 0);
 		bt_bmc->poll_timer.expires = jiffies + msecs_to_jiffies(10);
 		add_timer(&bt_bmc->poll_timer);
 	}
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index 9de189db2cc3..f45732a2cb3e 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -4766,7 +4766,7 @@ static struct timer_list ipmi_timer;
 
 static atomic_t stop_operation;
 
-static void ipmi_timeout(unsigned long data)
+static void ipmi_timeout(struct timer_list *unused)
 {
 	ipmi_smi_t intf;
 	int nt = 0;
@@ -5172,7 +5172,7 @@ static int ipmi_init_msghandler(void)
 
 #endif /* CONFIG_IPMI_PROC_INTERFACE */
 
-	setup_timer(&ipmi_timer, ipmi_timeout, 0);
+	timer_setup(&ipmi_timer, ipmi_timeout, 0);
 	mod_timer(&ipmi_timer, jiffies + IPMI_TIMEOUT_JIFFIES);
 
 	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
index 71d33a1807e4..779869ed32b1 100644
--- a/drivers/char/ipmi/ipmi_si_intf.c
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -1091,9 +1091,9 @@ static void set_need_watch(void *send_info, bool enable)
 	spin_unlock_irqrestore(&smi_info->si_lock, flags);
 }
 
-static void smi_timeout(unsigned long data)
+static void smi_timeout(struct timer_list *t)
 {
-	struct smi_info   *smi_info = (struct smi_info *) data;
+	struct smi_info   *smi_info = from_timer(smi_info, t, si_timer);
 	enum si_sm_result smi_result;
 	unsigned long     flags;
 	unsigned long     jiffies_now;
@@ -1166,7 +1166,7 @@ static int smi_start_processing(void       *send_info,
 	new_smi->intf = intf;
 
 	/* Set up the timer that drives the interface. */
-	setup_timer(&new_smi->si_timer, smi_timeout, (long)new_smi);
+	timer_setup(&new_smi->si_timer, smi_timeout, 0);
 	smi_mod_timer(new_smi, jiffies + SI_TIMEOUT_JIFFIES);
 
 	/* Try to claim any interrupts. */
diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c
index 466b3a1c0adf..3cfaec728604 100644
--- a/drivers/char/ipmi/ipmi_ssif.c
+++ b/drivers/char/ipmi/ipmi_ssif.c
@@ -551,9 +551,9 @@ static void start_get(struct ssif_info *ssif_info)
 	}
 }
 
-static void retry_timeout(unsigned long data)
+static void retry_timeout(struct timer_list *t)
 {
-	struct ssif_info *ssif_info = (void *) data;
+	struct ssif_info *ssif_info = from_timer(ssif_info, t, retry_timer);
 	unsigned long oflags, *flags;
 	bool waiting;
 
@@ -1691,8 +1691,7 @@ static int ssif_probe(struct i2c_client *client, const struct i2c_device_id *id)
 
 	spin_lock_init(&ssif_info->lock);
 	ssif_info->ssif_state = SSIF_NORMAL;
-	setup_timer(&ssif_info->retry_timer, retry_timeout,
-		    (unsigned long)ssif_info);
+	timer_setup(&ssif_info->retry_timer, retry_timeout, 0);
 
 	for (i = 0; i < SSIF_NUM_STATS; i++)
 		atomic_set(&ssif_info->stats[i], 0);
diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c
index 461bf0b8a094..230b99288024 100644
--- a/drivers/char/tpm/tpm-dev-common.c
+++ b/drivers/char/tpm/tpm-dev-common.c
@@ -22,9 +22,9 @@
 #include "tpm.h"
 #include "tpm-dev.h"
 
-static void user_reader_timeout(unsigned long ptr)
+static void user_reader_timeout(struct timer_list *t)
 {
-	struct file_priv *priv = (struct file_priv *)ptr;
+	struct file_priv *priv = from_timer(priv, t, user_read_timer);
 
 	pr_warn("TPM user space timeout is deprecated (pid=%d)\n",
 		task_tgid_nr(current));
@@ -48,8 +48,7 @@ void tpm_common_open(struct file *file, struct tpm_chip *chip,
 	priv->chip = chip;
 	atomic_set(&priv->data_pending, 0);
 	mutex_init(&priv->buffer_mutex);
-	setup_timer(&priv->user_read_timer, user_reader_timeout,
-			(unsigned long)priv);
+	timer_setup(&priv->user_read_timer, user_reader_timeout, 0);
 	INIT_WORK(&priv->work, timeout_work);
 
 	file->private_data = priv;
diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c
index 09c1c4ff93ca..3717b3df34a4 100644
--- a/drivers/gpu/drm/drm_vblank.c
+++ b/drivers/gpu/drm/drm_vblank.c
@@ -367,9 +367,9 @@ void drm_vblank_disable_and_save(struct drm_device *dev, unsigned int pipe)
 	spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags);
 }
 
-static void vblank_disable_fn(unsigned long arg)
+static void vblank_disable_fn(struct timer_list *t)
 {
-	struct drm_vblank_crtc *vblank = (void *)arg;
+	struct drm_vblank_crtc *vblank = from_timer(vblank, t, disable_timer);
 	struct drm_device *dev = vblank->dev;
 	unsigned int pipe = vblank->pipe;
 	unsigned long irqflags;
@@ -436,8 +436,7 @@ int drm_vblank_init(struct drm_device *dev, unsigned int num_crtcs)
 		vblank->dev = dev;
 		vblank->pipe = i;
 		init_waitqueue_head(&vblank->queue);
-		setup_timer(&vblank->disable_timer, vblank_disable_fn,
-			    (unsigned long)vblank);
+		timer_setup(&vblank->disable_timer, vblank_disable_fn, 0);
 		seqlock_init(&vblank->seqlock);
 	}
 
@@ -1019,7 +1018,7 @@ static void drm_vblank_put(struct drm_device *dev, unsigned int pipe)
 		if (drm_vblank_offdelay == 0)
 			return;
 		else if (drm_vblank_offdelay < 0)
-			vblank_disable_fn((unsigned long)vblank);
+			vblank_disable_fn(&vblank->disable_timer);
 		else if (!dev->vblank_disable_immediate)
 			mod_timer(&vblank->disable_timer,
 				  jiffies + ((drm_vblank_offdelay * HZ)/1000));
@@ -1650,7 +1649,7 @@ bool drm_handle_vblank(struct drm_device *dev, unsigned int pipe)
 	spin_unlock_irqrestore(&dev->event_lock, irqflags);
 
 	if (disable_irq)
-		vblank_disable_fn((unsigned long)vblank);
+		vblank_disable_fn(&vblank->disable_timer);
 
 	return true;
 }
diff --git a/drivers/gpu/drm/exynos/exynos_drm_vidi.c b/drivers/gpu/drm/exynos/exynos_drm_vidi.c
index 53e03f8af3d5..e6b0940b1ac2 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_vidi.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_vidi.c
@@ -161,9 +161,9 @@ static const struct exynos_drm_crtc_ops vidi_crtc_ops = {
 	.atomic_flush = exynos_crtc_handle_event,
 };
 
-static void vidi_fake_vblank_timer(unsigned long arg)
+static void vidi_fake_vblank_timer(struct timer_list *t)
 {
-	struct vidi_context *ctx = (void *)arg;
+	struct vidi_context *ctx = from_timer(ctx, t, timer);
 
 	if (drm_crtc_handle_vblank(&ctx->crtc->base))
 		mod_timer(&ctx->timer,
@@ -449,7 +449,7 @@ static int vidi_probe(struct platform_device *pdev)
 
 	ctx->pdev = pdev;
 
-	setup_timer(&ctx->timer, vidi_fake_vblank_timer, (unsigned long)ctx);
+	timer_setup(&ctx->timer, vidi_fake_vblank_timer, 0);
 
 	mutex_init(&ctx->lock);
 
diff --git a/drivers/gpu/drm/i2c/tda998x_drv.c b/drivers/gpu/drm/i2c/tda998x_drv.c
index 4d1f45acf2cd..127815253a84 100644
--- a/drivers/gpu/drm/i2c/tda998x_drv.c
+++ b/drivers/gpu/drm/i2c/tda998x_drv.c
@@ -601,9 +601,9 @@ tda998x_reset(struct tda998x_priv *priv)
  * we have seen a HPD inactive->active transition.  This code implements
  * that delay.
  */
-static void tda998x_edid_delay_done(unsigned long data)
+static void tda998x_edid_delay_done(struct timer_list *t)
 {
-	struct tda998x_priv *priv = (struct tda998x_priv *)data;
+	struct tda998x_priv *priv = from_timer(priv, t, edid_delay_timer);
 
 	priv->edid_delay_active = false;
 	wake_up(&priv->edid_delay_waitq);
@@ -1492,8 +1492,7 @@ static int tda998x_create(struct i2c_client *client, struct tda998x_priv *priv)
 
 	mutex_init(&priv->mutex);	/* protect the page access */
 	init_waitqueue_head(&priv->edid_delay_waitq);
-	setup_timer(&priv->edid_delay_timer, tda998x_edid_delay_done,
-		    (unsigned long)priv);
+	timer_setup(&priv->edid_delay_timer, tda998x_edid_delay_done, 0);
 	INIT_WORK(&priv->detect_work, tda998x_detect_work);
 
 	/* wake up the device: */
diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
index 40f4840ef98e..970c7963ae29 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
@@ -82,9 +82,9 @@ static struct msm_ringbuffer *get_next_ring(struct msm_gpu *gpu)
 	return NULL;
 }
 
-static void a5xx_preempt_timer(unsigned long data)
+static void a5xx_preempt_timer(struct timer_list *t)
 {
-	struct a5xx_gpu *a5xx_gpu = (struct a5xx_gpu *) data;
+	struct a5xx_gpu *a5xx_gpu = from_timer(a5xx_gpu, t, preempt_timer);
 	struct msm_gpu *gpu = &a5xx_gpu->base.base;
 	struct drm_device *dev = gpu->dev;
 	struct msm_drm_private *priv = dev->dev_private;
@@ -300,6 +300,5 @@ void a5xx_preempt_init(struct msm_gpu *gpu)
 		}
 	}
 
-	setup_timer(&a5xx_gpu->preempt_timer, a5xx_preempt_timer,
-		(unsigned long) a5xx_gpu);
+	timer_setup(&a5xx_gpu->preempt_timer, a5xx_preempt_timer, 0);
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index 8d4477818ec2..232201403439 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -353,9 +353,9 @@ static void hangcheck_timer_reset(struct msm_gpu *gpu)
 			round_jiffies_up(jiffies + DRM_MSM_HANGCHECK_JIFFIES));
 }
 
-static void hangcheck_handler(unsigned long data)
+static void hangcheck_handler(struct timer_list *t)
 {
-	struct msm_gpu *gpu = (struct msm_gpu *)data;
+	struct msm_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
 	struct drm_device *dev = gpu->dev;
 	struct msm_drm_private *priv = dev->dev_private;
 	struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);
@@ -703,8 +703,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
 	INIT_WORK(&gpu->recover_work, recover_worker);
 
 
-	setup_timer(&gpu->hangcheck_timer, hangcheck_handler,
-			(unsigned long)gpu);
+	timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
 
 	spin_lock_init(&gpu->perf_lock);
 
diff --git a/drivers/gpu/drm/omapdrm/dss/dsi.c b/drivers/gpu/drm/omapdrm/dss/dsi.c
index cea744e4d9bd..c2cf6d98e577 100644
--- a/drivers/gpu/drm/omapdrm/dss/dsi.c
+++ b/drivers/gpu/drm/omapdrm/dss/dsi.c
@@ -4095,7 +4095,7 @@ static void dsi_update_screen_dispc(struct platform_device *dsidev)
 }
 
 #ifdef DSI_CATCH_MISSING_TE
-static void dsi_te_timeout(unsigned long arg)
+static void dsi_te_timeout(struct timer_list *unused)
 {
 	DSSERR("TE not received for 250ms!\n");
 }
@@ -5449,7 +5449,7 @@ static int dsi_bind(struct device *dev, struct device *master, void *data)
 			     dsi_framedone_timeout_work_callback);
 
 #ifdef DSI_CATCH_MISSING_TE
-	setup_timer(&dsi->te_timer, dsi_te_timeout, 0);
+	timer_setup(&dsi->te_timer, dsi_te_timeout, 0);
 #endif
 
 	dsi_mem = platform_get_resource_byname(dsidev, IORESOURCE_MEM, "proto");
diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_psr.c b/drivers/gpu/drm/rockchip/rockchip_drm_psr.c
index a553e182ff53..3acfd576b7df 100644
--- a/drivers/gpu/drm/rockchip/rockchip_drm_psr.c
+++ b/drivers/gpu/drm/rockchip/rockchip_drm_psr.c
@@ -101,9 +101,9 @@ static void psr_set_state(struct psr_drv *psr, enum psr_state state)
 	spin_unlock_irqrestore(&psr->lock, flags);
 }
 
-static void psr_flush_handler(unsigned long data)
+static void psr_flush_handler(struct timer_list *t)
 {
-	struct psr_drv *psr = (struct psr_drv *)data;
+	struct psr_drv *psr = from_timer(psr, t, flush_timer);
 	unsigned long flags;
 
 	/* If the state has changed since we initiated the flush, do nothing */
@@ -232,7 +232,7 @@ int rockchip_drm_psr_register(struct drm_encoder *encoder,
 	if (!psr)
 		return -ENOMEM;
 
-	setup_timer(&psr->flush_timer, psr_flush_handler, (unsigned long)psr);
+	timer_setup(&psr->flush_timer, psr_flush_handler, 0);
 	spin_lock_init(&psr->lock);
 
 	psr->active = true;
diff --git a/drivers/gpu/drm/vgem/vgem_fence.c b/drivers/gpu/drm/vgem/vgem_fence.c
index 8fd52f211e9d..b28876c222b4 100644
--- a/drivers/gpu/drm/vgem/vgem_fence.c
+++ b/drivers/gpu/drm/vgem/vgem_fence.c
@@ -85,9 +85,9 @@ static const struct dma_fence_ops vgem_fence_ops = {
 	.timeline_value_str = vgem_fence_timeline_value_str,
 };
 
-static void vgem_fence_timeout(unsigned long data)
+static void vgem_fence_timeout(struct timer_list *t)
 {
-	struct vgem_fence *fence = (struct vgem_fence *)data;
+	struct vgem_fence *fence = from_timer(fence, t, timer);
 
 	dma_fence_signal(&fence->base);
 }
@@ -105,7 +105,7 @@ static struct dma_fence *vgem_fence_create(struct vgem_file *vfile,
 	dma_fence_init(&fence->base, &vgem_fence_ops, &fence->lock,
 		       dma_fence_context_alloc(1), 1);
 
-	setup_timer(&fence->timer, vgem_fence_timeout, (unsigned long)fence);
+	timer_setup(&fence->timer, vgem_fence_timeout, 0);
 
 	/* We force the fence to expire within 10s to prevent driver hangs */
 	mod_timer(&fence->timer, jiffies + VGEM_FENCE_TIMEOUT);
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index 32c9938e1e1e..d6e84a589ef1 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -452,9 +452,9 @@ via_dmablit_sync(struct drm_device *dev, uint32_t handle, int engine)
 
 
 static void
-via_dmablit_timer(unsigned long data)
+via_dmablit_timer(struct timer_list *t)
 {
-	drm_via_blitq_t *blitq = (drm_via_blitq_t *) data;
+	drm_via_blitq_t *blitq = from_timer(blitq, t, poll_timer);
 	struct drm_device *dev = blitq->dev;
 	int engine = (int)
 		(blitq - ((drm_via_private_t *)dev->dev_private)->blit_queues);
@@ -559,8 +559,7 @@ via_init_dmablit(struct drm_device *dev)
 			init_waitqueue_head(blitq->blit_queue + j);
 		init_waitqueue_head(&blitq->busy_queue);
 		INIT_WORK(&blitq->wq, via_dmablit_workqueue);
-		setup_timer(&blitq->poll_timer, via_dmablit_timer,
-				(unsigned long)blitq);
+		timer_setup(&blitq->poll_timer, via_dmablit_timer, 0);
 	}
 }
 
diff --git a/drivers/hid/hid-appleir.c b/drivers/hid/hid-appleir.c
index 07cbc70f00e7..eae7d52cf1a8 100644
--- a/drivers/hid/hid-appleir.c
+++ b/drivers/hid/hid-appleir.c
@@ -173,9 +173,9 @@ static void battery_flat(struct appleir *appleir)
 	dev_err(&appleir->input_dev->dev, "possible flat battery?\n");
 }
 
-static void key_up_tick(unsigned long data)
+static void key_up_tick(struct timer_list *t)
 {
-	struct appleir *appleir = (struct appleir *)data;
+	struct appleir *appleir = from_timer(appleir, t, key_up_timer);
 	struct hid_device *hid = appleir->hid;
 	unsigned long flags;
 
@@ -303,8 +303,7 @@ static int appleir_probe(struct hid_device *hid, const struct hid_device_id *id)
 	hid->quirks |= HID_QUIRK_HIDINPUT_FORCE;
 
 	spin_lock_init(&appleir->lock);
-	setup_timer(&appleir->key_up_timer,
-		    key_up_tick, (unsigned long) appleir);
+	timer_setup(&appleir->key_up_timer, key_up_tick, 0);
 
 	hid_set_drvdata(hid, appleir);
 
diff --git a/drivers/hid/hid-prodikeys.c b/drivers/hid/hid-prodikeys.c
index 49c4bd34b3c5..87eda34ea2f8 100644
--- a/drivers/hid/hid-prodikeys.c
+++ b/drivers/hid/hid-prodikeys.c
@@ -239,9 +239,9 @@ drop_note:
 	return;
 }
 
-static void pcmidi_sustained_note_release(unsigned long data)
+static void pcmidi_sustained_note_release(struct timer_list *t)
 {
-	struct pcmidi_sustain *pms = (struct pcmidi_sustain *)data;
+	struct pcmidi_sustain *pms = from_timer(pms, t, timer);
 
 	pcmidi_send_note(pms->pm, pms->status, pms->note, pms->velocity);
 	pms->in_use = 0;
@@ -256,8 +256,7 @@ static void init_sustain_timers(struct pcmidi_snd *pm)
 		pms = &pm->sustained_notes[i];
 		pms->in_use = 0;
 		pms->pm = pm;
-		setup_timer(&pms->timer, pcmidi_sustained_note_release,
-			(unsigned long)pms);
+		timer_setup(&pms->timer, pcmidi_sustained_note_release, 0);
 	}
 }
 
diff --git a/drivers/hid/hid-wiimote-core.c b/drivers/hid/hid-wiimote-core.c
index d00391418d1a..579884ebd94d 100644
--- a/drivers/hid/hid-wiimote-core.c
+++ b/drivers/hid/hid-wiimote-core.c
@@ -1226,9 +1226,9 @@ static void wiimote_schedule(struct wiimote_data *wdata)
 	spin_unlock_irqrestore(&wdata->state.lock, flags);
 }
 
-static void wiimote_init_timeout(unsigned long arg)
+static void wiimote_init_timeout(struct timer_list *t)
 {
-	struct wiimote_data *wdata = (void*)arg;
+	struct wiimote_data *wdata = from_timer(wdata, t, timer);
 
 	wiimote_schedule(wdata);
 }
@@ -1740,7 +1740,7 @@ static struct wiimote_data *wiimote_create(struct hid_device *hdev)
 	wdata->state.cmd_battery = 0xff;
 
 	INIT_WORK(&wdata->init_worker, wiimote_init_worker);
-	setup_timer(&wdata->timer, wiimote_init_timeout, (long)wdata);
+	timer_setup(&wdata->timer, wiimote_init_timeout, 0);
 
 	return wdata;
 }
diff --git a/drivers/iio/common/ssp_sensors/ssp_dev.c b/drivers/iio/common/ssp_sensors/ssp_dev.c
index ea7adb638d99..2ba2ff5e59c4 100644
--- a/drivers/iio/common/ssp_sensors/ssp_dev.c
+++ b/drivers/iio/common/ssp_sensors/ssp_dev.c
@@ -175,9 +175,9 @@ static void ssp_wdt_work_func(struct work_struct *work)
 	data->timeout_cnt = 0;
 }
 
-static void ssp_wdt_timer_func(unsigned long ptr)
+static void ssp_wdt_timer_func(struct timer_list *t)
 {
-	struct ssp_data *data = (struct ssp_data *)ptr;
+	struct ssp_data *data = from_timer(data, t, wdt_timer);
 
 	switch (data->fw_dl_state) {
 	case SSP_FW_DL_STATE_FAIL:
@@ -571,7 +571,7 @@ static int ssp_probe(struct spi_device *spi)
 	INIT_WORK(&data->work_wdt, ssp_wdt_work_func);
 	INIT_DELAYED_WORK(&data->work_refresh, ssp_refresh_task);
 
-	setup_timer(&data->wdt_timer, ssp_wdt_timer_func, (unsigned long)data);
+	timer_setup(&data->wdt_timer, ssp_wdt_timer_func, 0);
 
 	ret = request_threaded_irq(data->spi->irq, NULL,
 				   ssp_irq_thread_fn,
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 9beee9cef137..ee0ee1f9994b 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -642,9 +642,9 @@ err:
 	return -ENOMEM;
 }
 
-static void delay_time_func(unsigned long ctx)
+static void delay_time_func(struct timer_list *t)
 {
-	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
+	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 
 	dev->fill_delay = 0;
 }
@@ -663,7 +663,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		return -ENOMEM;
 	}
 
-	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
+	timer_setup(&dev->delay_timer, delay_time_func, 0);
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
 		ent = &cache->ent[i];
 		INIT_LIST_HEAD(&ent->head);
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index cedc665364cd..73862a836062 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -202,9 +202,9 @@ void gameport_stop_polling(struct gameport *gameport)
 }
 EXPORT_SYMBOL(gameport_stop_polling);
 
-static void gameport_run_poll_handler(unsigned long d)
+static void gameport_run_poll_handler(struct timer_list *t)
 {
-	struct gameport *gameport = (struct gameport *)d;
+	struct gameport *gameport = from_timer(gameport, t, poll_timer);
 
 	gameport->poll_handler(gameport);
 	if (gameport->poll_cnt)
@@ -542,8 +542,7 @@ static void gameport_init_port(struct gameport *gameport)
 
 	INIT_LIST_HEAD(&gameport->node);
 	spin_lock_init(&gameport->timer_lock);
-	setup_timer(&gameport->poll_timer, gameport_run_poll_handler,
-		    (unsigned long)gameport);
+	timer_setup(&gameport->poll_timer, gameport_run_poll_handler, 0);
 }
 
 /*
diff --git a/drivers/input/joystick/db9.c b/drivers/input/joystick/db9.c
index f4ad83eab67f..de0dd4756c84 100644
--- a/drivers/input/joystick/db9.c
+++ b/drivers/input/joystick/db9.c
@@ -364,9 +364,9 @@ static int db9_saturn(int mode, struct parport *port, struct input_dev *devs[])
 	return 0;
 }
 
-static void db9_timer(unsigned long private)
+static void db9_timer(struct timer_list *t)
 {
-	struct db9 *db9 = (void *) private;
+	struct db9 *db9 = from_timer(db9, t, timer);
 	struct parport *port = db9->pd->port;
 	struct input_dev *dev = db9->dev[0];
 	struct input_dev *dev2 = db9->dev[1];
@@ -609,7 +609,7 @@ static void db9_attach(struct parport *pp)
 	db9->pd = pd;
 	db9->mode = mode;
 	db9->parportno = pp->number;
-	setup_timer(&db9->timer, db9_timer, (long)db9);
+	timer_setup(&db9->timer, db9_timer, 0);
 
 	for (i = 0; i < (min(db9_mode->n_pads, DB9_MAX_DEVICES)); i++) {
 
diff --git a/drivers/input/joystick/gamecon.c b/drivers/input/joystick/gamecon.c
index ca734ea97e53..2ffb2e8bdc3b 100644
--- a/drivers/input/joystick/gamecon.c
+++ b/drivers/input/joystick/gamecon.c
@@ -743,9 +743,9 @@ static void gc_psx_process_packet(struct gc *gc)
  * gc_timer() initiates reads of console pads data.
  */
 
-static void gc_timer(unsigned long private)
+static void gc_timer(struct timer_list *t)
 {
-	struct gc *gc = (void *) private;
+	struct gc *gc = from_timer(gc, t, timer);
 
 /*
  * N64 pads - must be read first, any read confuses them for 200 us
@@ -974,7 +974,7 @@ static void gc_attach(struct parport *pp)
 	mutex_init(&gc->mutex);
 	gc->pd = pd;
 	gc->parportno = pp->number;
-	setup_timer(&gc->timer, gc_timer, (long) gc);
+	timer_setup(&gc->timer, gc_timer, 0);
 
 	for (i = 0; i < n_pads && i < GC_MAX_DEVICES; i++) {
 		if (!pads[i])
diff --git a/drivers/input/joystick/turbografx.c b/drivers/input/joystick/turbografx.c
index a1fdc75a438d..e2685753e460 100644
--- a/drivers/input/joystick/turbografx.c
+++ b/drivers/input/joystick/turbografx.c
@@ -89,9 +89,9 @@ static struct tgfx {
  * tgfx_timer() reads and analyzes TurboGraFX joystick data.
  */
 
-static void tgfx_timer(unsigned long private)
+static void tgfx_timer(struct timer_list *t)
 {
-	struct tgfx *tgfx = (void *) private;
+	struct tgfx *tgfx = from_timer(tgfx, t, timer);
 	struct input_dev *dev;
 	int data1, data2, i;
 
@@ -200,7 +200,7 @@ static void tgfx_attach(struct parport *pp)
 	mutex_init(&tgfx->sem);
 	tgfx->pd = pd;
 	tgfx->parportno = pp->number;
-	setup_timer(&tgfx->timer, tgfx_timer, (long)tgfx);
+	timer_setup(&tgfx->timer, tgfx_timer, 0);
 
 	for (i = 0; i < n_devs; i++) {
 		if (n_buttons[i] < 1)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 466aaa8ba841..83fe2621effe 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -36,7 +36,7 @@ static unsigned long iova_rcache_get(struct iova_domain *iovad,
 static void init_iova_rcaches(struct iova_domain *iovad);
 static void free_iova_rcaches(struct iova_domain *iovad);
 static void fq_destroy_all_entries(struct iova_domain *iovad);
-static void fq_flush_timeout(unsigned long data);
+static void fq_flush_timeout(struct timer_list *t);
 
 void
 init_iova_domain(struct iova_domain *iovad, unsigned long granule,
@@ -107,7 +107,7 @@ int init_iova_flush_queue(struct iova_domain *iovad,
 		spin_lock_init(&fq->lock);
 	}
 
-	setup_timer(&iovad->fq_timer, fq_flush_timeout, (unsigned long)iovad);
+	timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
 	atomic_set(&iovad->fq_timer_on, 0);
 
 	return 0;
@@ -519,9 +519,9 @@ static void fq_destroy_all_entries(struct iova_domain *iovad)
 	}
 }
 
-static void fq_flush_timeout(unsigned long data)
+static void fq_flush_timeout(struct timer_list *t)
 {
-	struct iova_domain *iovad = (struct iova_domain *)data;
+	struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
 	int cpu;
 
 	atomic_set(&iovad->fq_timer_on, 0);
diff --git a/drivers/isdn/capi/capidrv.c b/drivers/isdn/capi/capidrv.c
index 89dd1303a98a..49fef08858c5 100644
--- a/drivers/isdn/capi/capidrv.c
+++ b/drivers/isdn/capi/capidrv.c
@@ -2235,9 +2235,9 @@ static void send_listen(capidrv_contr *card)
 	send_message(card, &cmdcmsg);
 }
 
-static void listentimerfunc(unsigned long x)
+static void listentimerfunc(struct timer_list *t)
 {
-	capidrv_contr *card = (capidrv_contr *)x;
+	capidrv_contr *card = from_timer(card, t, listentimer);
 	if (card->state != ST_LISTEN_NONE && card->state != ST_LISTEN_ACTIVE)
 		printk(KERN_ERR "%s: controller dead ??\n", card->name);
 	send_listen(card);
@@ -2264,7 +2264,7 @@ static int capidrv_addcontr(u16 contr, struct capi_profile *profp)
 		return -1;
 	}
 	card->owner = THIS_MODULE;
-	setup_timer(&card->listentimer, listentimerfunc, (unsigned long)card);
+	timer_setup(&card->listentimer, listentimerfunc, 0);
 	strcpy(card->name, id);
 	card->contrnr = contr;
 	card->nbchan = profp->nbchannel;
diff --git a/drivers/isdn/divert/isdn_divert.c b/drivers/isdn/divert/isdn_divert.c
index 6f423bc49d0d..5620fd2c6009 100644
--- a/drivers/isdn/divert/isdn_divert.c
+++ b/drivers/isdn/divert/isdn_divert.c
@@ -55,10 +55,10 @@ DEFINE_SPINLOCK(divert_lock);
 /***************************/
 /* timer callback function */
 /***************************/
-static void deflect_timer_expire(ulong arg)
+static void deflect_timer_expire(struct timer_list *t)
 {
 	unsigned long flags;
-	struct call_struc *cs = (struct call_struc *) arg;
+	struct call_struc *cs = from_timer(cs, t, timer);
 
 	spin_lock_irqsave(&divert_lock, flags);
 	del_timer(&cs->timer); /* delete active timer */
@@ -157,7 +157,7 @@ int cf_command(int drvid, int mode,
 	/* allocate mem for information struct */
 	if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC)))
 		return (-ENOMEM); /* no memory */
-	setup_timer(&cs->timer, deflect_timer_expire, (ulong)cs);
+	timer_setup(&cs->timer, deflect_timer_expire, 0);
 	cs->info[0] = '\0';
 	cs->ics.driver = drvid;
 	cs->ics.command = ISDN_CMD_PROT_IO; /* protocol specific io */
@@ -450,8 +450,7 @@ static int isdn_divert_icall(isdn_ctrl *ic)
 					return (0); /* no external deflection needed */
 			if (!(cs = kmalloc(sizeof(struct call_struc), GFP_ATOMIC)))
 				return (0); /* no memory */
-			setup_timer(&cs->timer, deflect_timer_expire,
-				    (ulong)cs);
+			timer_setup(&cs->timer, deflect_timer_expire, 0);
 			cs->info[0] = '\0';
 
 			cs->ics = *ic; /* copy incoming data */
diff --git a/drivers/isdn/hardware/eicon/divasi.c b/drivers/isdn/hardware/eicon/divasi.c
index c61049585cbd..0033d74a7291 100644
--- a/drivers/isdn/hardware/eicon/divasi.c
+++ b/drivers/isdn/hardware/eicon/divasi.c
@@ -78,7 +78,7 @@ static unsigned int um_idi_poll(struct file *file, poll_table *wait);
 static int um_idi_open(struct inode *inode, struct file *file);
 static int um_idi_release(struct inode *inode, struct file *file);
 static int remove_entity(void *entity);
-static void diva_um_timer_function(unsigned long data);
+static void diva_um_timer_function(struct timer_list *t);
 
 /*
  * proc entry
@@ -300,8 +300,7 @@ static int um_idi_open_adapter(struct file *file, int adapter_nr)
 	p_os = (diva_um_idi_os_context_t *) diva_um_id_get_os_context(e);
 	init_waitqueue_head(&p_os->read_wait);
 	init_waitqueue_head(&p_os->close_wait);
-	setup_timer(&p_os->diva_timer_id, (void *)diva_um_timer_function,
-		    (unsigned long)p_os);
+	timer_setup(&p_os->diva_timer_id, diva_um_timer_function, 0);
 	p_os->aborted = 0;
 	p_os->adapter_nr = adapter_nr;
 	return (1);
@@ -457,9 +456,9 @@ void diva_os_wakeup_close(void *os_context)
 }
 
 static
-void diva_um_timer_function(unsigned long data)
+void diva_um_timer_function(struct timer_list *t)
 {
-	diva_um_idi_os_context_t *p_os = (diva_um_idi_os_context_t *) data;
+	diva_um_idi_os_context_t *p_os = from_timer(p_os, t, diva_timer_id);
 
 	p_os->aborted = 1;
 	wake_up_interruptible(&p_os->read_wait);
diff --git a/drivers/isdn/hardware/mISDN/hfcmulti.c b/drivers/isdn/hardware/mISDN/hfcmulti.c
index 3cf07b8ced1c..4d85645c87f7 100644
--- a/drivers/isdn/hardware/mISDN/hfcmulti.c
+++ b/drivers/isdn/hardware/mISDN/hfcmulti.c
@@ -2855,7 +2855,7 @@ irq_notforus:
  */
 
 static void
-hfcmulti_dbusy_timer(struct hfc_multi *hc)
+hfcmulti_dbusy_timer(struct timer_list *t)
 {
 }
 
@@ -3877,8 +3877,7 @@ hfcmulti_initmode(struct dchannel *dch)
 		if (hc->dnum[pt]) {
 			mode_hfcmulti(hc, dch->slot, dch->dev.D.protocol,
 				      -1, 0, -1, 0);
-			setup_timer(&dch->timer, (void *)hfcmulti_dbusy_timer,
-				    (long)dch);
+			timer_setup(&dch->timer, hfcmulti_dbusy_timer, 0);
 		}
 		for (i = 1; i <= 31; i++) {
 			if (!((1 << i) & hc->bmask[pt])) /* skip unused chan */
@@ -3984,8 +3983,7 @@ hfcmulti_initmode(struct dchannel *dch)
 		hc->chan[i].slot_rx = -1;
 		hc->chan[i].conf = -1;
 		mode_hfcmulti(hc, i, dch->dev.D.protocol, -1, 0, -1, 0);
-		setup_timer(&dch->timer, (void *)hfcmulti_dbusy_timer,
-			    (long)dch);
+		timer_setup(&dch->timer, hfcmulti_dbusy_timer, 0);
 		hc->chan[i - 2].slot_tx = -1;
 		hc->chan[i - 2].slot_rx = -1;
 		hc->chan[i - 2].conf = -1;
diff --git a/drivers/isdn/hardware/mISDN/hfcpci.c b/drivers/isdn/hardware/mISDN/hfcpci.c
index e4ebbee863a1..ba3fe14bbe00 100644
--- a/drivers/isdn/hardware/mISDN/hfcpci.c
+++ b/drivers/isdn/hardware/mISDN/hfcpci.c
@@ -1241,7 +1241,7 @@ hfcpci_int(int intno, void *dev_id)
  * timer callback for D-chan busy resolution. Currently no function
  */
 static void
-hfcpci_dbusy_timer(struct hfc_pci *hc)
+hfcpci_dbusy_timer(struct timer_list *t)
 {
 }
 
@@ -1717,8 +1717,7 @@ static void
 inithfcpci(struct hfc_pci *hc)
 {
 	printk(KERN_DEBUG "inithfcpci: entered\n");
-	setup_timer(&hc->dch.timer, (void *)hfcpci_dbusy_timer,
-		    (long)&hc->dch);
+	timer_setup(&hc->dch.timer, hfcpci_dbusy_timer, 0);
 	hc->chanlimit = 2;
 	mode_hfcpci(&hc->bch[0], 1, -1);
 	mode_hfcpci(&hc->bch[1], 2, -1);
diff --git a/drivers/isdn/hardware/mISDN/mISDNisar.c b/drivers/isdn/hardware/mISDN/mISDNisar.c
index 5b078591b6ee..b791688d0228 100644
--- a/drivers/isdn/hardware/mISDN/mISDNisar.c
+++ b/drivers/isdn/hardware/mISDN/mISDNisar.c
@@ -1146,9 +1146,9 @@ mISDNisar_irq(struct isar_hw *isar)
 EXPORT_SYMBOL(mISDNisar_irq);
 
 static void
-ftimer_handler(unsigned long data)
+ftimer_handler(struct timer_list *t)
 {
-	struct isar_ch *ch = (struct isar_ch *)data;
+	struct isar_ch *ch = from_timer(ch, t, ftimer);
 
 	pr_debug("%s: ftimer flags %lx\n", ch->is->name, ch->bch.Flags);
 	test_and_clear_bit(FLG_FTI_RUN, &ch->bch.Flags);
@@ -1635,11 +1635,9 @@ init_isar(struct isar_hw *isar)
 	}
 	if (isar->version != 1)
 		return -EINVAL;
-	setup_timer(&isar->ch[0].ftimer, &ftimer_handler,
-		    (long)&isar->ch[0]);
+	timer_setup(&isar->ch[0].ftimer, ftimer_handler, 0);
 	test_and_set_bit(FLG_INITIALIZED, &isar->ch[0].bch.Flags);
-	setup_timer(&isar->ch[1].ftimer, &ftimer_handler,
-		    (long)&isar->ch[1]);
+	timer_setup(&isar->ch[1].ftimer, ftimer_handler, 0);
 	test_and_set_bit(FLG_INITIALIZED, &isar->ch[1].bch.Flags);
 	return 0;
 }
diff --git a/drivers/isdn/i4l/isdn_common.c b/drivers/isdn/i4l/isdn_common.c
index 3fa2f7b31131..8b03d618185e 100644
--- a/drivers/isdn/i4l/isdn_common.c
+++ b/drivers/isdn/i4l/isdn_common.c
@@ -231,7 +231,7 @@ static int isdn_timer_cnt2 = 0;
 static int isdn_timer_cnt3 = 0;
 
 static void
-isdn_timer_funct(ulong dummy)
+isdn_timer_funct(struct timer_list *unused)
 {
 	int tf = dev->tflags;
 	if (tf & ISDN_TIMER_FAST) {
@@ -2294,7 +2294,7 @@ static int __init isdn_init(void)
 		printk(KERN_WARNING "isdn: Could not allocate device-struct.\n");
 		return -EIO;
 	}
-	setup_timer(&dev->timer, isdn_timer_funct, 0UL);
+	timer_setup(&dev->timer, isdn_timer_funct, 0);
 	spin_lock_init(&dev->lock);
 	spin_lock_init(&dev->timerlock);
 #ifdef MODULE
diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index 59d40160cab2..c138f66f2659 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -1509,9 +1509,9 @@ static int isdn_net_ioctl(struct net_device *dev,
 
 /* called via cisco_timer.function */
 static void
-isdn_net_ciscohdlck_slarp_send_keepalive(unsigned long data)
+isdn_net_ciscohdlck_slarp_send_keepalive(struct timer_list *t)
 {
-	isdn_net_local *lp = (isdn_net_local *) data;
+	isdn_net_local *lp = from_timer(lp, t, cisco_timer);
 	struct sk_buff *skb;
 	unsigned char *p;
 	unsigned long last_cisco_myseq = lp->cisco_myseq;
@@ -1615,9 +1615,8 @@ isdn_net_ciscohdlck_connected(isdn_net_local *lp)
 	/* send slarp request because interface/seq.no.s reset */
 	isdn_net_ciscohdlck_slarp_send_request(lp);
 
-	setup_timer(&lp->cisco_timer,
-		    isdn_net_ciscohdlck_slarp_send_keepalive,
-		    (unsigned long)lp);
+	timer_setup(&lp->cisco_timer,
+		    isdn_net_ciscohdlck_slarp_send_keepalive, 0);
 	lp->cisco_timer.expires = jiffies + lp->cisco_keepalive_period * HZ;
 	add_timer(&lp->cisco_timer);
 }
diff --git a/drivers/isdn/i4l/isdn_ppp.c b/drivers/isdn/i4l/isdn_ppp.c
index cd2b3c69771a..e07aefb9151d 100644
--- a/drivers/isdn/i4l/isdn_ppp.c
+++ b/drivers/isdn/i4l/isdn_ppp.c
@@ -50,7 +50,7 @@ static struct ippp_ccp_reset *isdn_ppp_ccp_reset_alloc(struct ippp_struct *is);
 static void isdn_ppp_ccp_reset_free(struct ippp_struct *is);
 static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is,
 					  unsigned char id);
-static void isdn_ppp_ccp_timer_callback(unsigned long closure);
+static void isdn_ppp_ccp_timer_callback(struct timer_list *t);
 static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_struct *is,
 								   unsigned char id);
 static void isdn_ppp_ccp_reset_trans(struct ippp_struct *is,
@@ -2327,10 +2327,10 @@ static void isdn_ppp_ccp_reset_free_state(struct ippp_struct *is,
 
 /* The timer callback function which is called when a ResetReq has timed out,
    aka has never been answered by a ResetAck */
-static void isdn_ppp_ccp_timer_callback(unsigned long closure)
+static void isdn_ppp_ccp_timer_callback(struct timer_list *t)
 {
 	struct ippp_ccp_reset_state *rs =
-		(struct ippp_ccp_reset_state *)closure;
+		from_timer(rs, t, timer);
 
 	if (!rs) {
 		printk(KERN_ERR "ippp_ccp: timer cb with zero closure.\n");
@@ -2376,8 +2376,7 @@ static struct ippp_ccp_reset_state *isdn_ppp_ccp_reset_alloc_state(struct ippp_s
 		rs->state = CCPResetIdle;
 		rs->is = is;
 		rs->id = id;
-		setup_timer(&rs->timer, isdn_ppp_ccp_timer_callback,
-			    (unsigned long)rs);
+		timer_setup(&rs->timer, isdn_ppp_ccp_timer_callback, 0);
 		is->reset->rs[id] = rs;
 	}
 	return rs;
diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c
index d30130c8d0f3..960f26348bb5 100644
--- a/drivers/isdn/i4l/isdn_tty.c
+++ b/drivers/isdn/i4l/isdn_tty.c
@@ -541,9 +541,9 @@ isdn_tty_senddown(modem_info *info)
  * into the tty's buffer.
  */
 static void
-isdn_tty_modem_do_ncarrier(unsigned long data)
+isdn_tty_modem_do_ncarrier(struct timer_list *t)
 {
-	modem_info *info = (modem_info *) data;
+	modem_info *info = from_timer(info, t, nc_timer);
 	isdn_tty_modem_result(RESULT_NO_CARRIER, info);
 }
 
@@ -1812,8 +1812,7 @@ isdn_tty_modem_init(void)
 		info->isdn_channel = -1;
 		info->drv_index = -1;
 		info->xmit_size = ISDN_SERIAL_XMIT_SIZE;
-		setup_timer(&info->nc_timer, isdn_tty_modem_do_ncarrier,
-			    (unsigned long)info);
+		timer_setup(&info->nc_timer, isdn_tty_modem_do_ncarrier, 0);
 		skb_queue_head_init(&info->xmit_queue);
 #ifdef CONFIG_ISDN_AUDIO
 		skb_queue_head_init(&info->dtmf_queue);
diff --git a/drivers/media/platform/s5p-mfc/s5p_mfc.c b/drivers/media/platform/s5p-mfc/s5p_mfc.c
index e179b33d3775..bc68dbbcaec1 100644
--- a/drivers/media/platform/s5p-mfc/s5p_mfc.c
+++ b/drivers/media/platform/s5p-mfc/s5p_mfc.c
@@ -145,9 +145,9 @@ void s5p_mfc_cleanup_queue(struct list_head *lh, struct vb2_queue *vq)
 	}
 }
 
-static void s5p_mfc_watchdog(unsigned long arg)
+static void s5p_mfc_watchdog(struct timer_list *t)
 {
-	struct s5p_mfc_dev *dev = (struct s5p_mfc_dev *)arg;
+	struct s5p_mfc_dev *dev = from_timer(dev, t, watchdog_timer);
 
 	if (test_bit(0, &dev->hw_lock))
 		atomic_inc(&dev->watchdog_cnt);
@@ -1314,8 +1314,7 @@ static int s5p_mfc_probe(struct platform_device *pdev)
 	dev->hw_lock = 0;
 	INIT_WORK(&dev->watchdog_work, s5p_mfc_watchdog_worker);
 	atomic_set(&dev->watchdog_cnt, 0);
-	setup_timer(&dev->watchdog_timer, s5p_mfc_watchdog,
-		    (unsigned long)dev);
+	timer_setup(&dev->watchdog_timer, s5p_mfc_watchdog, 0);
 
 	ret = v4l2_device_register(&pdev->dev, &dev->v4l2_dev);
 	if (ret)
diff --git a/drivers/media/platform/sti/c8sectpfe/c8sectpfe-core.c b/drivers/media/platform/sti/c8sectpfe/c8sectpfe-core.c
index 59280ac31937..a0acee7671b1 100644
--- a/drivers/media/platform/sti/c8sectpfe/c8sectpfe-core.c
+++ b/drivers/media/platform/sti/c8sectpfe/c8sectpfe-core.c
@@ -61,9 +61,9 @@ static int load_c8sectpfe_fw(struct c8sectpfei *fei);
 
 #define FIFO_LEN 1024
 
-static void c8sectpfe_timer_interrupt(unsigned long ac8sectpfei)
+static void c8sectpfe_timer_interrupt(struct timer_list *t)
 {
-	struct c8sectpfei *fei = (struct c8sectpfei *)ac8sectpfei;
+	struct c8sectpfei *fei = from_timer(fei, t, timer);
 	struct channel_info *channel;
 	int chan_num;
 
@@ -865,8 +865,7 @@ static int c8sectpfe_probe(struct platform_device *pdev)
 	}
 
 	/* Setup timer interrupt */
-	setup_timer(&fei->timer, c8sectpfe_timer_interrupt,
-		    (unsigned long)fei);
+	timer_setup(&fei->timer, c8sectpfe_timer_interrupt, 0);
 
 	mutex_init(&fei->lock);
 
diff --git a/drivers/media/platform/vim2m.c b/drivers/media/platform/vim2m.c
index b01fba020d5f..7bf9fa2f8534 100644
--- a/drivers/media/platform/vim2m.c
+++ b/drivers/media/platform/vim2m.c
@@ -388,9 +388,9 @@ static void device_run(void *priv)
 	schedule_irq(dev, ctx->transtime);
 }
 
-static void device_isr(unsigned long priv)
+static void device_isr(struct timer_list *t)
 {
-	struct vim2m_dev *vim2m_dev = (struct vim2m_dev *)priv;
+	struct vim2m_dev *vim2m_dev = from_timer(vim2m_dev, t, timer);
 	struct vim2m_ctx *curr_ctx;
 	struct vb2_v4l2_buffer *src_vb, *dst_vb;
 	unsigned long flags;
@@ -1024,7 +1024,7 @@ static int vim2m_probe(struct platform_device *pdev)
 	v4l2_info(&dev->v4l2_dev,
 			"Device registered as /dev/video%d\n", vfd->num);
 
-	setup_timer(&dev->timer, device_isr, (long)dev);
+	timer_setup(&dev->timer, device_isr, 0);
 	platform_set_drvdata(pdev, dev);
 
 	dev->m2m_dev = v4l2_m2m_init(&m2m_ops);
diff --git a/drivers/media/usb/au0828/au0828-dvb.c b/drivers/media/usb/au0828/au0828-dvb.c
index d701c04b3783..d9093a3c57c5 100644
--- a/drivers/media/usb/au0828/au0828-dvb.c
+++ b/drivers/media/usb/au0828/au0828-dvb.c
@@ -105,9 +105,9 @@ static struct tda18271_config hauppauge_woodbury_tunerconfig = {
 
 static void au0828_restart_dvb_streaming(struct work_struct *work);
 
-static void au0828_bulk_timeout(unsigned long data)
+static void au0828_bulk_timeout(struct timer_list *t)
 {
-	struct au0828_dev *dev = (struct au0828_dev *) data;
+	struct au0828_dev *dev = from_timer(dev, t, bulk_timeout);
 
 	dprintk(1, "%s called\n", __func__);
 	dev->bulk_timeout_running = 0;
@@ -648,8 +648,7 @@ int au0828_dvb_register(struct au0828_dev *dev)
 		return ret;
 	}
 
-	setup_timer(&dev->bulk_timeout, au0828_bulk_timeout,
-		    (unsigned long)dev);
+	timer_setup(&dev->bulk_timeout, au0828_bulk_timeout, 0);
 
 	return 0;
 }
diff --git a/drivers/media/usb/au0828/au0828-video.c b/drivers/media/usb/au0828/au0828-video.c
index 654f67c25863..a240153821e0 100644
--- a/drivers/media/usb/au0828/au0828-video.c
+++ b/drivers/media/usb/au0828/au0828-video.c
@@ -954,9 +954,9 @@ int au0828_analog_unregister(struct au0828_dev *dev)
 /* This function ensures that video frames continue to be delivered even if
    the ITU-656 input isn't receiving any data (thereby preventing applications
    such as tvtime from hanging) */
-static void au0828_vid_buffer_timeout(unsigned long data)
+static void au0828_vid_buffer_timeout(struct timer_list *t)
 {
-	struct au0828_dev *dev = (struct au0828_dev *) data;
+	struct au0828_dev *dev = from_timer(dev, t, vid_timeout);
 	struct au0828_dmaqueue *dma_q = &dev->vidq;
 	struct au0828_buffer *buf;
 	unsigned char *vid_data;
@@ -978,9 +978,9 @@ static void au0828_vid_buffer_timeout(unsigned long data)
 	spin_unlock_irqrestore(&dev->slock, flags);
 }
 
-static void au0828_vbi_buffer_timeout(unsigned long data)
+static void au0828_vbi_buffer_timeout(struct timer_list *t)
 {
-	struct au0828_dev *dev = (struct au0828_dev *) data;
+	struct au0828_dev *dev = from_timer(dev, t, vbi_timeout);
 	struct au0828_dmaqueue *dma_q = &dev->vbiq;
 	struct au0828_buffer *buf;
 	unsigned char *vbi_data;
@@ -1953,10 +1953,8 @@ int au0828_analog_register(struct au0828_dev *dev,
 	INIT_LIST_HEAD(&dev->vidq.active);
 	INIT_LIST_HEAD(&dev->vbiq.active);
 
-	setup_timer(&dev->vid_timeout, au0828_vid_buffer_timeout,
-		    (unsigned long)dev);
-	setup_timer(&dev->vbi_timeout, au0828_vbi_buffer_timeout,
-		    (unsigned long)dev);
+	timer_setup(&dev->vid_timeout, au0828_vid_buffer_timeout, 0);
+	timer_setup(&dev->vbi_timeout, au0828_vbi_buffer_timeout, 0);
 
 	dev->width = NTSC_STD_W;
 	dev->height = NTSC_STD_H;
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 22de7f5ed032..57b13dfbd21e 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -1492,9 +1492,9 @@ static int msb_ftl_scan(struct msb_data *msb)
 	return 0;
 }
 
-static void msb_cache_flush_timer(unsigned long data)
+static void msb_cache_flush_timer(struct timer_list *t)
 {
-	struct msb_data *msb = (struct msb_data *)data;
+	struct msb_data *msb = from_timer(msb, t, cache_flush_timer);
 	msb->need_flush_cache = true;
 	queue_work(msb->io_queue, &msb->io_work);
 }
@@ -1514,8 +1514,7 @@ static void msb_cache_discard(struct msb_data *msb)
 
 static int msb_cache_init(struct msb_data *msb)
 {
-	setup_timer(&msb->cache_flush_timer, msb_cache_flush_timer,
-		(unsigned long)msb);
+	timer_setup(&msb->cache_flush_timer, msb_cache_flush_timer, 0);
 
 	if (!msb->cache)
 		msb->cache = kzalloc(msb->block_size, GFP_KERNEL);
diff --git a/drivers/mfd/rtsx_usb.c b/drivers/mfd/rtsx_usb.c
index 691dab791f7a..59d61b04c197 100644
--- a/drivers/mfd/rtsx_usb.c
+++ b/drivers/mfd/rtsx_usb.c
@@ -40,9 +40,9 @@ static const struct mfd_cell rtsx_usb_cells[] = {
 	},
 };
 
-static void rtsx_usb_sg_timed_out(unsigned long data)
+static void rtsx_usb_sg_timed_out(struct timer_list *t)
 {
-	struct rtsx_ucr *ucr = (struct rtsx_ucr *)data;
+	struct rtsx_ucr *ucr = from_timer(ucr, t, sg_timer);
 
 	dev_dbg(&ucr->pusb_intf->dev, "%s: sg transfer timed out", __func__);
 	usb_sg_cancel(&ucr->current_sg);
@@ -663,7 +663,7 @@ static int rtsx_usb_probe(struct usb_interface *intf,
 		goto out_init_fail;
 
 	/* initialize USB SG transfer timer */
-	setup_timer(&ucr->sg_timer, rtsx_usb_sg_timed_out, (unsigned long) ucr);
+	timer_setup(&ucr->sg_timer, rtsx_usb_sg_timed_out, 0);
 
 	ret = mfd_add_hotplug_devices(&intf->dev, rtsx_usb_cells,
 				      ARRAY_SIZE(rtsx_usb_cells));
diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c
index 35a9e4fd1a9f..64b03d6eaf18 100644
--- a/drivers/mmc/core/host.c
+++ b/drivers/mmc/core/host.c
@@ -160,9 +160,9 @@ out:
 	return err;
 }
 
-static void mmc_retune_timer(unsigned long data)
+static void mmc_retune_timer(struct timer_list *t)
 {
-	struct mmc_host *host = (struct mmc_host *)data;
+	struct mmc_host *host = from_timer(host, t, retune_timer);
 
 	mmc_retune_needed(host);
 }
@@ -389,7 +389,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev)
 	init_waitqueue_head(&host->wq);
 	INIT_DELAYED_WORK(&host->detect, mmc_rescan);
 	INIT_DELAYED_WORK(&host->sdio_irq_work, sdio_irq_work);
-	setup_timer(&host->retune_timer, mmc_retune_timer, (unsigned long)host);
+	timer_setup(&host->retune_timer, mmc_retune_timer, 0);
 
 	/*
 	 * By default, hosts do not support SGIO or large requests.
diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c
index 3692dd547879..4237c7cebf02 100644
--- a/drivers/mtd/sm_ftl.c
+++ b/drivers/mtd/sm_ftl.c
@@ -989,9 +989,9 @@ restart:
 
 
 /* flush timer, runs a second after last write */
-static void sm_cache_flush_timer(unsigned long data)
+static void sm_cache_flush_timer(struct timer_list *t)
 {
-	struct sm_ftl *ftl = (struct sm_ftl *)data;
+	struct sm_ftl *ftl = from_timer(ftl, t, timer);
 	queue_work(cache_flush_workqueue, &ftl->flush_work);
 }
 
@@ -1139,7 +1139,7 @@ static void sm_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd)
 
 
 	mutex_init(&ftl->mutex);
-	setup_timer(&ftl->timer, sm_cache_flush_timer, (unsigned long)ftl);
+	timer_setup(&ftl->timer, sm_cache_flush_timer, 0);
 	INIT_WORK(&ftl->flush_work, sm_cache_flush_work);
 	init_completion(&ftl->erase_completion);
 
diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c
index fed75e75207a..b8029ea03307 100644
--- a/drivers/net/caif/caif_hsi.c
+++ b/drivers/net/caif/caif_hsi.c
@@ -66,9 +66,9 @@ static const struct cfhsi_config  hsi_default_config = {
 
 static LIST_HEAD(cfhsi_list);
 
-static void cfhsi_inactivity_tout(unsigned long arg)
+static void cfhsi_inactivity_tout(struct timer_list *t)
 {
-	struct cfhsi *cfhsi = (struct cfhsi *)arg;
+	struct cfhsi *cfhsi = from_timer(cfhsi, t, inactivity_timer);
 
 	netdev_dbg(cfhsi->ndev, "%s.\n",
 		__func__);
@@ -737,9 +737,9 @@ out_of_sync:
 	schedule_work(&cfhsi->out_of_sync_work);
 }
 
-static void cfhsi_rx_slowpath(unsigned long arg)
+static void cfhsi_rx_slowpath(struct timer_list *t)
 {
-	struct cfhsi *cfhsi = (struct cfhsi *)arg;
+	struct cfhsi *cfhsi = from_timer(cfhsi, t, rx_slowpath_timer);
 
 	netdev_dbg(cfhsi->ndev, "%s.\n",
 		__func__);
@@ -997,9 +997,9 @@ static void cfhsi_wake_down_cb(struct cfhsi_cb_ops *cb_ops)
 	wake_up_interruptible(&cfhsi->wake_down_wait);
 }
 
-static void cfhsi_aggregation_tout(unsigned long arg)
+static void cfhsi_aggregation_tout(struct timer_list *t)
 {
-	struct cfhsi *cfhsi = (struct cfhsi *)arg;
+	struct cfhsi *cfhsi = from_timer(cfhsi, t, aggregation_timer);
 
 	netdev_dbg(cfhsi->ndev, "%s.\n",
 		__func__);
@@ -1211,14 +1211,11 @@ static int cfhsi_open(struct net_device *ndev)
 	init_waitqueue_head(&cfhsi->flush_fifo_wait);
 
 	/* Setup the inactivity timer. */
-	setup_timer(&cfhsi->inactivity_timer, cfhsi_inactivity_tout,
-		    (unsigned long)cfhsi);
+	timer_setup(&cfhsi->inactivity_timer, cfhsi_inactivity_tout, 0);
 	/* Setup the slowpath RX timer. */
-	setup_timer(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath,
-		    (unsigned long)cfhsi);
+	timer_setup(&cfhsi->rx_slowpath_timer, cfhsi_rx_slowpath, 0);
 	/* Setup the aggregation timer. */
-	setup_timer(&cfhsi->aggregation_timer, cfhsi_aggregation_tout,
-		    (unsigned long)cfhsi);
+	timer_setup(&cfhsi->aggregation_timer, cfhsi_aggregation_tout, 0);
 
 	/* Activate HSI interface. */
 	res = cfhsi->ops->cfhsi_up(cfhsi->ops);
diff --git a/drivers/net/dsa/mv88e6xxx/phy.c b/drivers/net/dsa/mv88e6xxx/phy.c
index 436668bd50dc..46af8052e535 100644
--- a/drivers/net/dsa/mv88e6xxx/phy.c
+++ b/drivers/net/dsa/mv88e6xxx/phy.c
@@ -149,9 +149,9 @@ static void mv88e6xxx_phy_ppu_reenable_work(struct work_struct *ugly)
 	mutex_unlock(&chip->reg_lock);
 }
 
-static void mv88e6xxx_phy_ppu_reenable_timer(unsigned long _ps)
+static void mv88e6xxx_phy_ppu_reenable_timer(struct timer_list *t)
 {
-	struct mv88e6xxx_chip *chip = (void *)_ps;
+	struct mv88e6xxx_chip *chip = from_timer(chip, t, ppu_timer);
 
 	schedule_work(&chip->ppu_work);
 }
@@ -193,8 +193,7 @@ static void mv88e6xxx_phy_ppu_state_init(struct mv88e6xxx_chip *chip)
 {
 	mutex_init(&chip->ppu_mutex);
 	INIT_WORK(&chip->ppu_work, mv88e6xxx_phy_ppu_reenable_work);
-	setup_timer(&chip->ppu_timer, mv88e6xxx_phy_ppu_reenable_timer,
-		    (unsigned long)chip);
+	timer_setup(&chip->ppu_timer, mv88e6xxx_phy_ppu_reenable_timer, 0);
 }
 
 static void mv88e6xxx_phy_ppu_state_destroy(struct mv88e6xxx_chip *chip)
diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index fccce4b47778..74263f8efe1a 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -139,9 +139,9 @@ static netdev_tx_t eql_slave_xmit(struct sk_buff *skb, struct net_device *dev);
 
 static void eql_kill_one_slave(slave_queue_t *queue, slave_t *slave);
 
-static void eql_timer(unsigned long param)
+static void eql_timer(struct timer_list *t)
 {
-	equalizer_t *eql = (equalizer_t *) param;
+	equalizer_t *eql = from_timer(eql, t, timer);
 	struct list_head *this, *tmp, *head;
 
 	spin_lock(&eql->queue.lock);
@@ -178,7 +178,7 @@ static void __init eql_setup(struct net_device *dev)
 {
 	equalizer_t *eql = netdev_priv(dev);
 
-	setup_timer(&eql->timer, eql_timer, (unsigned long)eql);
+	timer_setup(&eql->timer, eql_timer, 0);
 	eql->timer.expires  	= jiffies + EQL_DEFAULT_RESCHED_IVAL;
 
 	spin_lock_init(&eql->queue.lock);
diff --git a/drivers/net/ethernet/adi/bfin_mac.c b/drivers/net/ethernet/adi/bfin_mac.c
index 0658cde1586a..7120f2b9c6ef 100644
--- a/drivers/net/ethernet/adi/bfin_mac.c
+++ b/drivers/net/ethernet/adi/bfin_mac.c
@@ -1092,9 +1092,11 @@ static void tx_reclaim_skb(struct bfin_mac_local *lp)
 	return;
 }
 
-static void tx_reclaim_skb_timeout(unsigned long lp)
+static void tx_reclaim_skb_timeout(struct timer_list *t)
 {
-	tx_reclaim_skb((struct bfin_mac_local *)lp);
+	struct bfin_mac_local *lp = from_timer(lp, t, tx_reclaim_timer);
+
+	tx_reclaim_skb(lp);
 }
 
 static int bfin_mac_hard_start_xmit(struct sk_buff *skb,
@@ -1650,8 +1652,7 @@ static int bfin_mac_probe(struct platform_device *pdev)
 	ndev->netdev_ops = &bfin_mac_netdev_ops;
 	ndev->ethtool_ops = &bfin_mac_ethtool_ops;
 
-	setup_timer(&lp->tx_reclaim_timer, tx_reclaim_skb_timeout,
-		    (unsigned long)lp);
+	timer_setup(&lp->tx_reclaim_timer, tx_reclaim_skb_timeout, 0);
 
 	lp->flags = 0;
 	netif_napi_add(ndev, &lp->napi, bfin_mac_poll, CONFIG_BFIN_RX_DESC_NUM);
diff --git a/drivers/net/ethernet/agere/et131x.c b/drivers/net/ethernet/agere/et131x.c
index 658e92f79d36..48220b6c600d 100644
--- a/drivers/net/ethernet/agere/et131x.c
+++ b/drivers/net/ethernet/agere/et131x.c
@@ -3080,9 +3080,9 @@ err_out:
  * The routine called when the error timer expires, to track the number of
  * recurring errors.
  */
-static void et131x_error_timer_handler(unsigned long data)
+static void et131x_error_timer_handler(struct timer_list *t)
 {
-	struct et131x_adapter *adapter = (struct et131x_adapter *)data;
+	struct et131x_adapter *adapter = from_timer(adapter, t, error_timer);
 	struct phy_device *phydev = adapter->netdev->phydev;
 
 	if (et1310_in_phy_coma(adapter)) {
@@ -3624,8 +3624,7 @@ static int et131x_open(struct net_device *netdev)
 	int result;
 
 	/* Start the timer to track NIC errors */
-	setup_timer(&adapter->error_timer, et131x_error_timer_handler,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->error_timer, et131x_error_timer_handler, 0);
 	adapter->error_timer.expires = jiffies +
 		msecs_to_jiffies(TX_ERROR_PERIOD);
 	add_timer(&adapter->error_timer);
diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c
index 1c1ddd891ca3..97c5a89a9cf7 100644
--- a/drivers/net/ethernet/amazon/ena/ena_netdev.c
+++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c
@@ -2859,9 +2859,9 @@ static void ena_update_host_info(struct ena_admin_host_info *host_info,
 		(netdev->features & GENMASK_ULL(63, 32)) >> 32;
 }
 
-static void ena_timer_service(unsigned long data)
+static void ena_timer_service(struct timer_list *t)
 {
-	struct ena_adapter *adapter = (struct ena_adapter *)data;
+	struct ena_adapter *adapter = from_timer(adapter, t, timer_service);
 	u8 *debug_area = adapter->ena_dev->host_attr.debug_area_virt_addr;
 	struct ena_admin_host_info *host_info =
 		adapter->ena_dev->host_attr.host_info;
@@ -3278,8 +3278,7 @@ static int ena_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	ena_update_hints(adapter, &get_feat_ctx.hw_hints);
 
-	setup_timer(&adapter->timer_service, ena_timer_service,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->timer_service, ena_timer_service, 0);
 	mod_timer(&adapter->timer_service, round_jiffies(jiffies + HZ));
 
 	dev_info(&pdev->dev, "%s found at mem %lx, mac addr %pM Queues %d\n",
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
index 483e97691eea..78dfb2ab78ce 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.c
@@ -163,9 +163,9 @@ static int aq_nic_update_link_status(struct aq_nic_s *self)
 	return 0;
 }
 
-static void aq_nic_service_timer_cb(unsigned long param)
+static void aq_nic_service_timer_cb(struct timer_list *t)
 {
-	struct aq_nic_s *self = (struct aq_nic_s *)param;
+	struct aq_nic_s *self = from_timer(self, t, service_timer);
 	struct net_device *ndev = aq_nic_get_ndev(self);
 	int err = 0;
 	unsigned int i = 0U;
@@ -201,9 +201,9 @@ err_exit:
 		  jiffies + AQ_CFG_SERVICE_TIMER_INTERVAL);
 }
 
-static void aq_nic_polling_timer_cb(unsigned long param)
+static void aq_nic_polling_timer_cb(struct timer_list *t)
 {
-	struct aq_nic_s *self = (struct aq_nic_s *)param;
+	struct aq_nic_s *self = from_timer(self, t, polling_timer);
 	struct aq_vec_s *aq_vec = NULL;
 	unsigned int i = 0U;
 
@@ -440,14 +440,12 @@ int aq_nic_start(struct aq_nic_s *self)
 	err = aq_nic_update_interrupt_moderation_settings(self);
 	if (err)
 		goto err_exit;
-	setup_timer(&self->service_timer, &aq_nic_service_timer_cb,
-		    (unsigned long)self);
+	timer_setup(&self->service_timer, aq_nic_service_timer_cb, 0);
 	mod_timer(&self->service_timer, jiffies +
 			AQ_CFG_SERVICE_TIMER_INTERVAL);
 
 	if (self->aq_nic_cfg.is_polling) {
-		setup_timer(&self->polling_timer, &aq_nic_polling_timer_cb,
-			    (unsigned long)self);
+		timer_setup(&self->polling_timer, aq_nic_polling_timer_cb, 0);
 		mod_timer(&self->polling_timer, jiffies +
 			  AQ_CFG_POLLING_TIMER_INTERVAL);
 	} else {
diff --git a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
index 8c9986f3fc01..94270f654b3b 100644
--- a/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
+++ b/drivers/net/ethernet/atheros/atl1c/atl1c_main.c
@@ -222,9 +222,10 @@ static u32 atl1c_wait_until_idle(struct atl1c_hw *hw, u32 modu_ctrl)
  * atl1c_phy_config - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl1c_phy_config(unsigned long data)
+static void atl1c_phy_config(struct timer_list *t)
 {
-	struct atl1c_adapter *adapter = (struct atl1c_adapter *) data;
+	struct atl1c_adapter *adapter = from_timer(adapter, t,
+						   phy_config_timer);
 	struct atl1c_hw *hw = &adapter->hw;
 	unsigned long flags;
 
@@ -2613,8 +2614,7 @@ static int atl1c_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	adapter->mii.phy_id_mask = 0x1f;
 	adapter->mii.reg_num_mask = MDIO_CTRL_REG_MASK;
 	netif_napi_add(netdev, &adapter->napi, atl1c_clean, 64);
-	setup_timer(&adapter->phy_config_timer, atl1c_phy_config,
-			(unsigned long)adapter);
+	timer_setup(&adapter->phy_config_timer, atl1c_phy_config, 0);
 	/* setup the private structure */
 	err = atl1c_sw_init(adapter);
 	if (err) {
diff --git a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
index 4f7e195af0bc..9dc6da039a6d 100644
--- a/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
+++ b/drivers/net/ethernet/atheros/atl1e/atl1e_main.c
@@ -130,9 +130,10 @@ static inline void atl1e_irq_reset(struct atl1e_adapter *adapter)
  * atl1e_phy_config - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl1e_phy_config(unsigned long data)
+static void atl1e_phy_config(struct timer_list *t)
 {
-	struct atl1e_adapter *adapter = (struct atl1e_adapter *) data;
+	struct atl1e_adapter *adapter = from_timer(adapter, t,
+						   phy_config_timer);
 	struct atl1e_hw *hw = &adapter->hw;
 	unsigned long flags;
 
@@ -2361,8 +2362,7 @@ static int atl1e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	netif_napi_add(netdev, &adapter->napi, atl1e_clean, 64);
 
-	setup_timer(&adapter->phy_config_timer, atl1e_phy_config,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->phy_config_timer, atl1e_phy_config, 0);
 
 	/* get user settings */
 	atl1e_check_options(adapter);
diff --git a/drivers/net/ethernet/atheros/atlx/atl1.c b/drivers/net/ethernet/atheros/atlx/atl1.c
index 83d2db2abb45..b81fbf119bce 100644
--- a/drivers/net/ethernet/atheros/atlx/atl1.c
+++ b/drivers/net/ethernet/atheros/atlx/atl1.c
@@ -2575,9 +2575,10 @@ static irqreturn_t atl1_intr(int irq, void *data)
  * atl1_phy_config - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl1_phy_config(unsigned long data)
+static void atl1_phy_config(struct timer_list *t)
 {
-	struct atl1_adapter *adapter = (struct atl1_adapter *)data;
+	struct atl1_adapter *adapter = from_timer(adapter, t,
+						  phy_config_timer);
 	struct atl1_hw *hw = &adapter->hw;
 	unsigned long flags;
 
@@ -3071,8 +3072,7 @@ static int atl1_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* assume we have no link for now */
 	netif_carrier_off(netdev);
 
-	setup_timer(&adapter->phy_config_timer, atl1_phy_config,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->phy_config_timer, atl1_phy_config, 0);
 	adapter->phy_timer_pending = false;
 
 	INIT_WORK(&adapter->reset_dev_task, atl1_reset_dev_task);
diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c
index 77a1c03255de..db4bcc51023a 100644
--- a/drivers/net/ethernet/atheros/atlx/atl2.c
+++ b/drivers/net/ethernet/atheros/atlx/atl2.c
@@ -1028,9 +1028,9 @@ static void atl2_tx_timeout(struct net_device *netdev)
  * atl2_watchdog - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl2_watchdog(unsigned long data)
+static void atl2_watchdog(struct timer_list *t)
 {
-	struct atl2_adapter *adapter = (struct atl2_adapter *) data;
+	struct atl2_adapter *adapter = from_timer(adapter, t, watchdog_timer);
 
 	if (!test_bit(__ATL2_DOWN, &adapter->flags)) {
 		u32 drop_rxd, drop_rxs;
@@ -1053,9 +1053,10 @@ static void atl2_watchdog(unsigned long data)
  * atl2_phy_config - Timer Call-back
  * @data: pointer to netdev cast into an unsigned long
  */
-static void atl2_phy_config(unsigned long data)
+static void atl2_phy_config(struct timer_list *t)
 {
-	struct atl2_adapter *adapter = (struct atl2_adapter *) data;
+	struct atl2_adapter *adapter = from_timer(adapter, t,
+						  phy_config_timer);
 	struct atl2_hw *hw = &adapter->hw;
 	unsigned long flags;
 
@@ -1434,11 +1435,9 @@ static int atl2_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	atl2_check_options(adapter);
 
-	setup_timer(&adapter->watchdog_timer, atl2_watchdog,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->watchdog_timer, atl2_watchdog, 0);
 
-	setup_timer(&adapter->phy_config_timer, atl2_phy_config,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->phy_config_timer, atl2_phy_config, 0);
 
 	INIT_WORK(&adapter->reset_task, atl2_reset_task);
 	INIT_WORK(&adapter->link_chg_task, atl2_link_chg_task);
diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c
index 42e44fc03a18..e445ab724827 100644
--- a/drivers/net/ethernet/broadcom/b44.c
+++ b/drivers/net/ethernet/broadcom/b44.c
@@ -599,9 +599,9 @@ static void b44_check_phy(struct b44 *bp)
 	}
 }
 
-static void b44_timer(unsigned long __opaque)
+static void b44_timer(struct timer_list *t)
 {
-	struct b44 *bp = (struct b44 *) __opaque;
+	struct b44 *bp = from_timer(bp, t, timer);
 
 	spin_lock_irq(&bp->lock);
 
@@ -1474,7 +1474,7 @@ static int b44_open(struct net_device *dev)
 		goto out;
 	}
 
-	setup_timer(&bp->timer, b44_timer, (unsigned long)bp);
+	timer_setup(&bp->timer, b44_timer, 0);
 	bp->timer.expires = jiffies + HZ;
 	add_timer(&bp->timer);
 
diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c
index b3055a76dfbf..7919f6112ecf 100644
--- a/drivers/net/ethernet/broadcom/bnx2.c
+++ b/drivers/net/ethernet/broadcom/bnx2.c
@@ -6183,9 +6183,9 @@ bnx2_5708_serdes_timer(struct bnx2 *bp)
 }
 
 static void
-bnx2_timer(unsigned long data)
+bnx2_timer(struct timer_list *t)
 {
-	struct bnx2 *bp = (struct bnx2 *) data;
+	struct bnx2 *bp = from_timer(bp, t, timer);
 
 	if (!netif_running(bp->dev))
 		return;
@@ -8462,7 +8462,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 	bnx2_set_default_link(bp);
 	bp->req_flow_ctrl = FLOW_CTRL_RX | FLOW_CTRL_TX;
 
-	setup_timer(&bp->timer, bnx2_timer, (unsigned long)bp);
+	timer_setup(&bp->timer, bnx2_timer, 0);
 	bp->timer.expires = RUN_AT(BNX2_TIMER_INTERVAL);
 
 #ifdef BCM_CNIC
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index be9fd7d184d0..91e2a7560b48 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -5761,9 +5761,9 @@ void bnx2x_drv_pulse(struct bnx2x *bp)
 		 bp->fw_drv_pulse_wr_seq);
 }
 
-static void bnx2x_timer(unsigned long data)
+static void bnx2x_timer(struct timer_list *t)
 {
-	struct bnx2x *bp = (struct bnx2x *) data;
+	struct bnx2x *bp = from_timer(bp, t, timer);
 
 	if (!netif_running(bp->dev))
 		return;
@@ -12421,7 +12421,7 @@ static int bnx2x_init_bp(struct bnx2x *bp)
 
 	bp->current_interval = CHIP_REV_IS_SLOW(bp) ? 5*HZ : HZ;
 
-	setup_timer(&bp->timer, bnx2x_timer, (unsigned long)bp);
+	timer_setup(&bp->timer, bnx2x_timer, 0);
 	bp->timer.expires = jiffies + bp->current_interval;
 
 	if (SHMEM2_HAS(bp, dcbx_lldp_params_offset) &&
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 33c49ad697e4..c5c38d4b7d1c 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6962,9 +6962,9 @@ static void bnxt_poll_controller(struct net_device *dev)
 }
 #endif
 
-static void bnxt_timer(unsigned long data)
+static void bnxt_timer(struct timer_list *t)
 {
-	struct bnxt *bp = (struct bnxt *)data;
+	struct bnxt *bp = from_timer(bp, t, timer);
 	struct net_device *dev = bp->dev;
 
 	if (!netif_running(dev))
@@ -7236,7 +7236,7 @@ static int bnxt_init_board(struct pci_dev *pdev, struct net_device *dev)
 
 	bnxt_init_dflt_coal(bp);
 
-	setup_timer(&bp->timer, bnxt_timer, (unsigned long)bp);
+	timer_setup(&bp->timer, bnxt_timer, 0);
 	bp->current_interval = BNXT_TIMER_INTERVAL;
 
 	clear_bit(BNXT_STATE_OPEN, &bp->state);
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index d8d5f207c759..de51c2177d03 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -10931,9 +10931,9 @@ static void tg3_chk_missed_msi(struct tg3 *tp)
 	}
 }
 
-static void tg3_timer(unsigned long __opaque)
+static void tg3_timer(struct timer_list *t)
 {
-	struct tg3 *tp = (struct tg3 *) __opaque;
+	struct tg3 *tp = from_timer(tp, t, timer);
 
 	spin_lock(&tp->lock);
 
@@ -11087,7 +11087,7 @@ static void tg3_timer_init(struct tg3 *tp)
 	tp->asf_multiplier = (HZ / tp->timer_offset) *
 			     TG3_FW_UPDATE_FREQ_SEC;
 
-	setup_timer(&tp->timer, tg3_timer, (unsigned long)tp);
+	timer_setup(&tp->timer, tg3_timer, 0);
 }
 
 static void tg3_timer_start(struct tg3 *tp)
diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 4a11baffe02d..e130fb757e7b 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -1676,9 +1676,9 @@ static int enic_poll_msix_rq(struct napi_struct *napi, int budget)
 	return work_done;
 }
 
-static void enic_notify_timer(unsigned long data)
+static void enic_notify_timer(struct timer_list *t)
 {
-	struct enic *enic = (struct enic *)data;
+	struct enic *enic = from_timer(enic, t, notify_timer);
 
 	enic_notify_check(enic);
 
@@ -2846,8 +2846,7 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Setup notification timer, HW reset task, and wq locks
 	 */
 
-	setup_timer(&enic->notify_timer, enic_notify_timer,
-		    (unsigned long)enic);
+	timer_setup(&enic->notify_timer, enic_notify_timer, 0);
 
 	enic_set_rx_coal_setting(enic);
 	INIT_WORK(&enic->reset, enic_reset);
diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 81c1fac00d33..62f204f32316 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -1346,9 +1346,9 @@ static void mib_counters_update(struct mv643xx_eth_private *mp)
 	spin_unlock_bh(&mp->mib_counters_lock);
 }
 
-static void mib_counters_timer_wrapper(unsigned long _mp)
+static void mib_counters_timer_wrapper(struct timer_list *t)
 {
-	struct mv643xx_eth_private *mp = (void *)_mp;
+	struct mv643xx_eth_private *mp = from_timer(mp, t, mib_counters_timer);
 	mib_counters_update(mp);
 	mod_timer(&mp->mib_counters_timer, jiffies + 30 * HZ);
 }
@@ -2321,9 +2321,9 @@ static int mv643xx_eth_poll(struct napi_struct *napi, int budget)
 	return work_done;
 }
 
-static inline void oom_timer_wrapper(unsigned long data)
+static inline void oom_timer_wrapper(struct timer_list *t)
 {
-	struct mv643xx_eth_private *mp = (void *)data;
+	struct mv643xx_eth_private *mp = from_timer(mp, t, rx_oom);
 
 	napi_schedule(&mp->napi);
 }
@@ -3178,8 +3178,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 
 	mib_counters_clear(mp);
 
-	setup_timer(&mp->mib_counters_timer, mib_counters_timer_wrapper,
-		    (unsigned long)mp);
+	timer_setup(&mp->mib_counters_timer, mib_counters_timer_wrapper, 0);
 	mp->mib_counters_timer.expires = jiffies + 30 * HZ;
 
 	spin_lock_init(&mp->mib_counters_lock);
@@ -3188,7 +3187,7 @@ static int mv643xx_eth_probe(struct platform_device *pdev)
 
 	netif_napi_add(dev, &mp->napi, mv643xx_eth_poll, NAPI_POLL_WEIGHT);
 
-	setup_timer(&mp->rx_oom, oom_timer_wrapper, (unsigned long)mp);
+	timer_setup(&mp->rx_oom, oom_timer_wrapper, 0);
 
 
 	res = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c
index 91b1c154fd29..7bbd86f08e5f 100644
--- a/drivers/net/ethernet/marvell/pxa168_eth.c
+++ b/drivers/net/ethernet/marvell/pxa168_eth.c
@@ -362,9 +362,9 @@ static void rxq_refill(struct net_device *dev)
 	}
 }
 
-static inline void rxq_refill_timer_wrapper(unsigned long data)
+static inline void rxq_refill_timer_wrapper(struct timer_list *t)
 {
-	struct pxa168_eth_private *pep = (void *)data;
+	struct pxa168_eth_private *pep = from_timer(pep, t, timeout);
 	napi_schedule(&pep->napi);
 }
 
@@ -1496,8 +1496,7 @@ static int pxa168_eth_probe(struct platform_device *pdev)
 	netif_napi_add(dev, &pep->napi, pxa168_rx_poll, pep->rx_ring_size);
 
 	memset(&pep->timeout, 0, sizeof(struct timer_list));
-	setup_timer(&pep->timeout, rxq_refill_timer_wrapper,
-		    (unsigned long)pep);
+	timer_setup(&pep->timeout, rxq_refill_timer_wrapper, 0);
 
 	pep->smi_bus = mdiobus_alloc();
 	if (!pep->smi_bus) {
diff --git a/drivers/net/ethernet/marvell/skge.c b/drivers/net/ethernet/marvell/skge.c
index eef35bf3e849..6e423f098a60 100644
--- a/drivers/net/ethernet/marvell/skge.c
+++ b/drivers/net/ethernet/marvell/skge.c
@@ -1495,9 +1495,9 @@ static int xm_check_link(struct net_device *dev)
  * get an interrupt when carrier is detected, need to poll for
  * link coming up.
  */
-static void xm_link_timer(unsigned long arg)
+static void xm_link_timer(struct timer_list *t)
 {
-	struct skge_port *skge = (struct skge_port *) arg;
+	struct skge_port *skge = from_timer(skge, t, link_timer);
 	struct net_device *dev = skge->netdev;
 	struct skge_hw *hw = skge->hw;
 	int port = skge->port;
@@ -3897,7 +3897,7 @@ static struct net_device *skge_devinit(struct skge_hw *hw, int port,
 
 	/* Only used for Genesis XMAC */
 	if (is_genesis(hw))
-	    setup_timer(&skge->link_timer, xm_link_timer, (unsigned long) skge);
+	    timer_setup(&skge->link_timer, xm_link_timer, 0);
 	else {
 		dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_SG |
 		                   NETIF_F_RXCSUM;
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index 1145cde2274a..9efe1771423c 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -2974,9 +2974,9 @@ static int sky2_rx_hung(struct net_device *dev)
 	}
 }
 
-static void sky2_watchdog(unsigned long arg)
+static void sky2_watchdog(struct timer_list *t)
 {
-	struct sky2_hw *hw = (struct sky2_hw *) arg;
+	struct sky2_hw *hw = from_timer(hw, t, watchdog_timer);
 
 	/* Check for lost IRQ once a second */
 	if (sky2_read32(hw, B0_ISRC)) {
@@ -5083,7 +5083,7 @@ static int sky2_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		sky2_show_addr(dev1);
 	}
 
-	setup_timer(&hw->watchdog_timer, sky2_watchdog, (unsigned long) hw);
+	timer_setup(&hw->watchdog_timer, sky2_watchdog, 0);
 	INIT_WORK(&hw->restart_work, sky2_restart);
 
 	pci_set_drvdata(pdev, hw);
diff --git a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
index b171ed2015fe..2521c8c40015 100644
--- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
+++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
@@ -3501,7 +3501,7 @@ static void myri10ge_watchdog(struct work_struct *work)
  * cannot detect a NIC with a parity error in a timely fashion if the
  * NIC is lightly loaded.
  */
-static void myri10ge_watchdog_timer(unsigned long arg)
+static void myri10ge_watchdog_timer(struct timer_list *t)
 {
 	struct myri10ge_priv *mgp;
 	struct myri10ge_slice_state *ss;
@@ -3509,7 +3509,7 @@ static void myri10ge_watchdog_timer(unsigned long arg)
 	u32 rx_pause_cnt;
 	u16 cmd;
 
-	mgp = (struct myri10ge_priv *)arg;
+	mgp = from_timer(mgp, t, watchdog_timer);
 
 	rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
 	busy_slice_cnt = 0;
@@ -3930,8 +3930,7 @@ static int myri10ge_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	pci_save_state(pdev);
 
 	/* Setup the watchdog timer */
-	setup_timer(&mgp->watchdog_timer, myri10ge_watchdog_timer,
-		    (unsigned long)mgp);
+	timer_setup(&mgp->watchdog_timer, myri10ge_watchdog_timer, 0);
 
 	netdev->ethtool_ops = &myri10ge_ethtool_ops;
 	INIT_WORK(&mgp->watchdog_work, myri10ge_watchdog);
diff --git a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
index 457ee80307ea..40e52ffb732f 100644
--- a/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
+++ b/drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c
@@ -1089,9 +1089,10 @@ static void pch_gbe_set_mode(struct pch_gbe_adapter *adapter, u16 speed,
  * pch_gbe_watchdog - Watchdog process
  * @data:  Board private structure
  */
-static void pch_gbe_watchdog(unsigned long data)
+static void pch_gbe_watchdog(struct timer_list *t)
 {
-	struct pch_gbe_adapter *adapter = (struct pch_gbe_adapter *)data;
+	struct pch_gbe_adapter *adapter = from_timer(adapter, t,
+						     watchdog_timer);
 	struct net_device *netdev = adapter->netdev;
 	struct pch_gbe_hw *hw = &adapter->hw;
 
@@ -2644,8 +2645,7 @@ static int pch_gbe_probe(struct pci_dev *pdev,
 		dev_err(&pdev->dev, "Invalid MAC address, "
 		                    "interface disabled.\n");
 	}
-	setup_timer(&adapter->watchdog_timer, pch_gbe_watchdog,
-		    (unsigned long)adapter);
+	timer_setup(&adapter->watchdog_timer, pch_gbe_watchdog, 0);
 
 	INIT_WORK(&adapter->reset_task, pch_gbe_reset_task);
 
diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c
index 49591d9c2e1b..c9a55b774935 100644
--- a/drivers/net/ethernet/pasemi/pasemi_mac.c
+++ b/drivers/net/ethernet/pasemi/pasemi_mac.c
@@ -943,9 +943,9 @@ static irqreturn_t pasemi_mac_rx_intr(int irq, void *data)
 
 #define TX_CLEAN_INTERVAL HZ
 
-static void pasemi_mac_tx_timer(unsigned long data)
+static void pasemi_mac_tx_timer(struct timer_list *t)
 {
-	struct pasemi_mac_txring *txring = (struct pasemi_mac_txring *)data;
+	struct pasemi_mac_txring *txring = from_timer(txring, t, clean_timer);
 	struct pasemi_mac *mac = txring->mac;
 
 	pasemi_mac_clean_tx(txring);
@@ -1199,8 +1199,7 @@ static int pasemi_mac_open(struct net_device *dev)
 	if (dev->phydev)
 		phy_start(dev->phydev);
 
-	setup_timer(&mac->tx->clean_timer, pasemi_mac_tx_timer,
-		    (unsigned long)mac->tx);
+	timer_setup(&mac->tx->clean_timer, pasemi_mac_tx_timer, 0);
 	mod_timer(&mac->tx->clean_timer, jiffies + HZ);
 
 	return 0;
diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c
index 05479d435469..9e5264d8773b 100644
--- a/drivers/net/ethernet/qlogic/qla3xxx.c
+++ b/drivers/net/ethernet/qlogic/qla3xxx.c
@@ -3749,9 +3749,9 @@ static void ql_get_board_info(struct ql3_adapter *qdev)
 	qdev->pci_slot = (u8) PCI_SLOT(qdev->pdev->devfn);
 }
 
-static void ql3xxx_timer(unsigned long ptr)
+static void ql3xxx_timer(struct timer_list *t)
 {
-	struct ql3_adapter *qdev = (struct ql3_adapter *)ptr;
+	struct ql3_adapter *qdev = from_timer(qdev, t, adapter_timer);
 	queue_delayed_work(qdev->workqueue, &qdev->link_state_work, 0);
 }
 
@@ -3891,7 +3891,7 @@ static int ql3xxx_probe(struct pci_dev *pdev,
 	INIT_DELAYED_WORK(&qdev->tx_timeout_work, ql_tx_timeout_work);
 	INIT_DELAYED_WORK(&qdev->link_state_work, ql_link_state_machine_work);
 
-	setup_timer(&qdev->adapter_timer, ql3xxx_timer, (unsigned long)qdev);
+	timer_setup(&qdev->adapter_timer, ql3xxx_timer, 0);
 	qdev->adapter_timer.expires = jiffies + HZ * 2;	/* two second delay */
 
 	if (!cards_found) {
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 0653b70723a3..6d6fb8cf3e7c 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -1983,9 +1983,9 @@ err_out:
 	return err;
 }
 
-static void ofdpa_fdb_cleanup(unsigned long data)
+static void ofdpa_fdb_cleanup(struct timer_list *t)
 {
-	struct ofdpa *ofdpa = (struct ofdpa *)data;
+	struct ofdpa *ofdpa = from_timer(ofdpa, t, fdb_cleanup_timer);
 	struct ofdpa_port *ofdpa_port;
 	struct ofdpa_fdb_tbl_entry *entry;
 	struct hlist_node *tmp;
@@ -2368,8 +2368,7 @@ static int ofdpa_init(struct rocker *rocker)
 	hash_init(ofdpa->neigh_tbl);
 	spin_lock_init(&ofdpa->neigh_tbl_lock);
 
-	setup_timer(&ofdpa->fdb_cleanup_timer, ofdpa_fdb_cleanup,
-		    (unsigned long) ofdpa);
+	timer_setup(&ofdpa->fdb_cleanup_timer, ofdpa_fdb_cleanup, 0);
 	mod_timer(&ofdpa->fdb_cleanup_timer, jiffies);
 
 	ofdpa->ageing_time = BR_DEFAULT_AGEING_TIME;
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index ff4fb5eae1af..f63c2ddced3c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -345,9 +345,9 @@ void stmmac_disable_eee_mode(struct stmmac_priv *priv)
  *  if there is no data transfer and if we are not in LPI state,
  *  then MAC Transmitter can be moved to LPI state.
  */
-static void stmmac_eee_ctrl_timer(unsigned long arg)
+static void stmmac_eee_ctrl_timer(struct timer_list *t)
 {
-	struct stmmac_priv *priv = (struct stmmac_priv *)arg;
+	struct stmmac_priv *priv = from_timer(priv, t, eee_ctrl_timer);
 
 	stmmac_enable_eee_mode(priv);
 	mod_timer(&priv->eee_ctrl_timer, STMMAC_LPI_T(eee_timer));
@@ -401,9 +401,8 @@ bool stmmac_eee_init(struct stmmac_priv *priv)
 		spin_lock_irqsave(&priv->lock, flags);
 		if (!priv->eee_active) {
 			priv->eee_active = 1;
-			setup_timer(&priv->eee_ctrl_timer,
-				    stmmac_eee_ctrl_timer,
-				    (unsigned long)priv);
+			timer_setup(&priv->eee_ctrl_timer,
+				    stmmac_eee_ctrl_timer, 0);
 			mod_timer(&priv->eee_ctrl_timer,
 				  STMMAC_LPI_T(eee_timer));
 
@@ -2221,9 +2220,9 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
  * Description:
  * This is the timer handler to directly invoke the stmmac_tx_clean.
  */
-static void stmmac_tx_timer(unsigned long data)
+static void stmmac_tx_timer(struct timer_list *t)
 {
-	struct stmmac_priv *priv = (struct stmmac_priv *)data;
+	struct stmmac_priv *priv = from_timer(priv, t, txtimer);
 	u32 tx_queues_count = priv->plat->tx_queues_to_use;
 	u32 queue;
 
@@ -2244,7 +2243,7 @@ static void stmmac_init_tx_coalesce(struct stmmac_priv *priv)
 {
 	priv->tx_coal_frames = STMMAC_TX_FRAMES;
 	priv->tx_coal_timer = STMMAC_COAL_TX_TIMER;
-	setup_timer(&priv->txtimer, stmmac_tx_timer, (unsigned long)priv);
+	timer_setup(&priv->txtimer, stmmac_tx_timer, 0);
 	priv->txtimer.expires = STMMAC_COAL_TIMER(priv->tx_coal_timer);
 	add_timer(&priv->txtimer);
 }
diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
index e1b55b8fb8e0..1f8e9601592a 100644
--- a/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
+++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-net.c
@@ -358,9 +358,9 @@ static irqreturn_t xlgmac_dma_isr(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
-static void xlgmac_tx_timer(unsigned long data)
+static void xlgmac_tx_timer(struct timer_list *t)
 {
-	struct xlgmac_channel *channel = (struct xlgmac_channel *)data;
+	struct xlgmac_channel *channel = from_timer(channel, t, tx_timer);
 	struct xlgmac_pdata *pdata = channel->pdata;
 	struct napi_struct *napi;
 
@@ -391,8 +391,7 @@ static void xlgmac_init_timers(struct xlgmac_pdata *pdata)
 		if (!channel->tx_ring)
 			break;
 
-		setup_timer(&channel->tx_timer, xlgmac_tx_timer,
-			    (unsigned long)channel);
+		timer_setup(&channel->tx_timer, xlgmac_tx_timer, 0);
 	}
 }
 
diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c
index cd1185e66133..b432a75fb874 100644
--- a/drivers/net/ethernet/ti/cpsw_ale.c
+++ b/drivers/net/ethernet/ti/cpsw_ale.c
@@ -765,9 +765,9 @@ int cpsw_ale_control_get(struct cpsw_ale *ale, int port, int control)
 }
 EXPORT_SYMBOL_GPL(cpsw_ale_control_get);
 
-static void cpsw_ale_timer(unsigned long arg)
+static void cpsw_ale_timer(struct timer_list *t)
 {
-	struct cpsw_ale *ale = (struct cpsw_ale *)arg;
+	struct cpsw_ale *ale = from_timer(ale, t, timer);
 
 	cpsw_ale_control_set(ale, 0, ALE_AGEOUT, 1);
 
@@ -859,7 +859,7 @@ void cpsw_ale_start(struct cpsw_ale *ale)
 	cpsw_ale_control_set(ale, 0, ALE_ENABLE, 1);
 	cpsw_ale_control_set(ale, 0, ALE_CLEAR, 1);
 
-	setup_timer(&ale->timer, cpsw_ale_timer, (unsigned long)ale);
+	timer_setup(&ale->timer, cpsw_ale_timer, 0);
 	if (ale->ageout) {
 		ale->timer.expires = jiffies + ale->ageout;
 		add_timer(&ale->timer);
diff --git a/drivers/net/ethernet/ti/netcp_ethss.c b/drivers/net/ethernet/ti/netcp_ethss.c
index 4ad821655e51..e831c49713ee 100644
--- a/drivers/net/ethernet/ti/netcp_ethss.c
+++ b/drivers/net/ethernet/ti/netcp_ethss.c
@@ -2745,9 +2745,9 @@ static int gbe_ioctl(void *intf_priv, struct ifreq *req, int cmd)
 	return -EOPNOTSUPP;
 }
 
-static void netcp_ethss_timer(unsigned long arg)
+static void netcp_ethss_timer(struct timer_list *t)
 {
-	struct gbe_priv *gbe_dev = (struct gbe_priv *)arg;
+	struct gbe_priv *gbe_dev = from_timer(gbe_dev, t, timer);
 	struct gbe_intf *gbe_intf;
 	struct gbe_slave *slave;
 
@@ -3616,8 +3616,7 @@ static int gbe_probe(struct netcp_device *netcp_device, struct device *dev,
 	}
 	spin_unlock_bh(&gbe_dev->hw_stats_lock);
 
-	setup_timer(&gbe_dev->timer, netcp_ethss_timer,
-		    (unsigned long)gbe_dev);
+	timer_setup(&gbe_dev->timer, netcp_ethss_timer, 0);
 	gbe_dev->timer.expires	 = jiffies + GBE_TIMER_INTERVAL;
 	add_timer(&gbe_dev->timer);
 	*inst_priv = gbe_dev;
diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c
index a913538d3213..d925b8203996 100644
--- a/drivers/net/ethernet/toshiba/spider_net.c
+++ b/drivers/net/ethernet/toshiba/spider_net.c
@@ -912,8 +912,9 @@ spider_net_xmit(struct sk_buff *skb, struct net_device *netdev)
  * packets, including updating the queue tail pointer.
  */
 static void
-spider_net_cleanup_tx_ring(struct spider_net_card *card)
+spider_net_cleanup_tx_ring(struct timer_list *t)
 {
+	struct spider_net_card *card = from_timer(card, t, tx_timer);
 	if ((spider_net_release_tx_chain(card, 0) != 0) &&
 	    (card->netdev->flags & IFF_UP)) {
 		spider_net_kick_tx_dma(card);
@@ -1265,7 +1266,7 @@ static int spider_net_poll(struct napi_struct *napi, int budget)
 	spider_net_refill_rx_chain(card);
 	spider_net_enable_rxdmac(card);
 
-	spider_net_cleanup_tx_ring(card);
+	spider_net_cleanup_tx_ring(&card->tx_timer);
 
 	/* if all packets are in the stack, enable interrupts and return 0 */
 	/* if not, return 1 */
@@ -1977,9 +1978,9 @@ init_firmware_failed:
  * @data: used for pointer to card structure
  *
  */
-static void spider_net_link_phy(unsigned long data)
+static void spider_net_link_phy(struct timer_list *t)
 {
-	struct spider_net_card *card = (struct spider_net_card *)data;
+	struct spider_net_card *card = from_timer(card, t, aneg_timer);
 	struct mii_phy *phy = &card->phy;
 
 	/* if link didn't come up after SPIDER_NET_ANEG_TIMEOUT tries, setup phy again */
@@ -2256,14 +2257,11 @@ spider_net_setup_netdev(struct spider_net_card *card)
 
 	pci_set_drvdata(card->pdev, netdev);
 
-	setup_timer(&card->tx_timer,
-		    (void(*)(unsigned long))spider_net_cleanup_tx_ring,
-		    (unsigned long)card);
+	timer_setup(&card->tx_timer, spider_net_cleanup_tx_ring, 0);
 	netdev->irq = card->pdev->irq;
 
 	card->aneg_count = 0;
-	setup_timer(&card->aneg_timer, spider_net_link_phy,
-		    (unsigned long)card);
+	timer_setup(&card->aneg_timer, spider_net_link_phy, 0);
 
 	netif_napi_add(netdev, &card->napi,
 		       spider_net_poll, SPIDER_NET_NAPI_WEIGHT);
diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c
index eb8a18991d8c..cc63102ca96e 100644
--- a/drivers/net/slip/slip.c
+++ b/drivers/net/slip/slip.c
@@ -106,8 +106,8 @@ static int slip_esc6(unsigned char *p, unsigned char *d, int len);
 static void slip_unesc6(struct slip *sl, unsigned char c);
 #endif
 #ifdef CONFIG_SLIP_SMART
-static void sl_keepalive(unsigned long sls);
-static void sl_outfill(unsigned long sls);
+static void sl_keepalive(struct timer_list *t);
+static void sl_outfill(struct timer_list *t);
 static int sl_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
 #endif
 
@@ -763,8 +763,8 @@ static struct slip *sl_alloc(dev_t line)
 	sl->mode        = SL_MODE_DEFAULT;
 #ifdef CONFIG_SLIP_SMART
 	/* initialize timer_list struct */
-	setup_timer(&sl->keepalive_timer, sl_keepalive, (unsigned long)sl);
-	setup_timer(&sl->outfill_timer, sl_outfill, (unsigned long)sl);
+	timer_setup(&sl->keepalive_timer, sl_keepalive, 0);
+	timer_setup(&sl->outfill_timer, sl_outfill, 0);
 #endif
 	slip_devs[i] = dev;
 	return sl;
@@ -1388,9 +1388,9 @@ module_exit(slip_exit);
  * added by Stanislav Voronyi. All changes before marked VSV
  */
 
-static void sl_outfill(unsigned long sls)
+static void sl_outfill(struct timer_list *t)
 {
-	struct slip *sl = (struct slip *)sls;
+	struct slip *sl = from_timer(sl, t, outfill_timer);
 
 	spin_lock(&sl->lock);
 
@@ -1419,9 +1419,9 @@ out:
 	spin_unlock(&sl->lock);
 }
 
-static void sl_keepalive(unsigned long sls)
+static void sl_keepalive(struct timer_list *t)
 {
-	struct slip *sl = (struct slip *)sls;
+	struct slip *sl = from_timer(sl, t, keepalive_timer);
 
 	spin_lock(&sl->lock);
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 5a2ea78a008f..c3af08f24679 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -444,9 +444,9 @@ static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
 	spin_unlock_bh(&tun->lock);
 }
 
-static void tun_flow_cleanup(unsigned long data)
+static void tun_flow_cleanup(struct timer_list *t)
 {
-	struct tun_struct *tun = (struct tun_struct *)data;
+	struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
 	unsigned long delay = tun->ageing_time;
 	unsigned long next_timer = jiffies + delay;
 	unsigned long count = 0;
@@ -1196,7 +1196,9 @@ static void tun_flow_init(struct tun_struct *tun)
 		INIT_HLIST_HEAD(&tun->flows[i]);
 
 	tun->ageing_time = TUN_FLOW_EXPIRE;
-	setup_timer(&tun->flow_gc_timer, tun_flow_cleanup, (unsigned long)tun);
+	timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
+	mod_timer(&tun->flow_gc_timer,
+		  round_jiffies_up(jiffies + tun->ageing_time));
 }
 
 static void tun_flow_uninit(struct tun_struct *tun)
diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c
index c7721c729541..afeca6bcdade 100644
--- a/drivers/net/wan/hdlc_ppp.c
+++ b/drivers/net/wan/hdlc_ppp.c
@@ -558,9 +558,9 @@ out:
 	return NET_RX_DROP;
 }
 
-static void ppp_timer(unsigned long arg)
+static void ppp_timer(struct timer_list *t)
 {
-	struct proto *proto = (struct proto *)arg;
+	struct proto *proto = from_timer(proto, t, timer);
 	struct ppp *ppp = get_ppp(proto->dev);
 	unsigned long flags;
 
@@ -610,7 +610,7 @@ static void ppp_start(struct net_device *dev)
 	for (i = 0; i < IDX_COUNT; i++) {
 		struct proto *proto = &ppp->protos[i];
 		proto->dev = dev;
-		setup_timer(&proto->timer, ppp_timer, (unsigned long)proto);
+		timer_setup(&proto->timer, ppp_timer, 0);
 		proto->state = CLOSED;
 	}
 	ppp->protos[IDX_LCP].pid = PID_LCP;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
index 3559fb5b8fb0..03aae6bc1838 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/btcoex.c
@@ -280,9 +280,9 @@ static void brcmf_btcoex_restore_part1(struct brcmf_btcoex_info *btci)
 /**
  * brcmf_btcoex_timerfunc() - BT coex timer callback
  */
-static void brcmf_btcoex_timerfunc(ulong data)
+static void brcmf_btcoex_timerfunc(struct timer_list *t)
 {
-	struct brcmf_btcoex_info *bt_local = (struct brcmf_btcoex_info *)data;
+	struct brcmf_btcoex_info *bt_local = from_timer(bt_local, t, timer);
 	brcmf_dbg(TRACE, "enter\n");
 
 	bt_local->timer_on = false;
@@ -380,7 +380,7 @@ int brcmf_btcoex_attach(struct brcmf_cfg80211_info *cfg)
 	/* Set up timer for BT  */
 	btci->timer_on = false;
 	btci->timeout = BRCMF_BTCOEX_OPPR_WIN_TIME;
-	setup_timer(&btci->timer, brcmf_btcoex_timerfunc, (ulong)btci);
+	timer_setup(&btci->timer, brcmf_btcoex_timerfunc, 0);
 	btci->cfg = cfg;
 	btci->saved_regs_part1 = false;
 	btci->saved_regs_part2 = false;
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
index 6e70df978159..15fa00d79fc6 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c
@@ -2983,10 +2983,10 @@ static void brcmf_cfg80211_escan_timeout_worker(struct work_struct *work)
 	brcmf_notify_escan_complete(cfg, cfg->escan_info.ifp, true, true);
 }
 
-static void brcmf_escan_timeout(unsigned long data)
+static void brcmf_escan_timeout(struct timer_list *t)
 {
 	struct brcmf_cfg80211_info *cfg =
-			(struct brcmf_cfg80211_info *)data;
+			from_timer(cfg, t, escan_timeout);
 
 	if (cfg->int_escan_map || cfg->scan_request) {
 		brcmf_err("timer expired\n");
@@ -3150,8 +3150,7 @@ static void brcmf_init_escan(struct brcmf_cfg80211_info *cfg)
 			    brcmf_cfg80211_escan_handler);
 	cfg->escan_info.escan_state = WL_ESCAN_STATE_IDLE;
 	/* Init scan_timeout timer */
-	setup_timer(&cfg->escan_timeout, brcmf_escan_timeout,
-		    (unsigned long)cfg);
+	timer_setup(&cfg->escan_timeout, brcmf_escan_timeout, 0);
 	INIT_WORK(&cfg->escan_timeout_work,
 		  brcmf_cfg80211_escan_timeout_worker);
 }
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
index e3495ea95553..310c4e2746aa 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/sdio.c
@@ -3972,9 +3972,9 @@ brcmf_sdio_watchdog_thread(void *data)
 }
 
 static void
-brcmf_sdio_watchdog(unsigned long data)
+brcmf_sdio_watchdog(struct timer_list *t)
 {
-	struct brcmf_sdio *bus = (struct brcmf_sdio *)data;
+	struct brcmf_sdio *bus = from_timer(bus, t, timer);
 
 	if (bus->watchdog_tsk) {
 		complete(&bus->watchdog_wait);
@@ -4169,8 +4169,7 @@ struct brcmf_sdio *brcmf_sdio_probe(struct brcmf_sdio_dev *sdiodev)
 	init_waitqueue_head(&bus->dcmd_resp_wait);
 
 	/* Set up the watchdog timer */
-	setup_timer(&bus->timer, brcmf_sdio_watchdog,
-		    (unsigned long)bus);
+	timer_setup(&bus->timer, brcmf_sdio_watchdog, 0);
 	/* Initialize watchdog thread */
 	init_completion(&bus->watchdog_wait);
 	bus->watchdog_tsk = kthread_run(brcmf_sdio_watchdog_thread,
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/main.c b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
index 2acd94da9efe..d11d72615de2 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/main.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
@@ -399,9 +399,9 @@ int iwl_send_statistics_request(struct iwl_priv *priv, u8 flags, bool clear)
  * was received.  We need to ensure we receive the statistics in order
  * to update the temperature used for calibrating the TXPOWER.
  */
-static void iwl_bg_statistics_periodic(unsigned long data)
+static void iwl_bg_statistics_periodic(struct timer_list *t)
 {
-	struct iwl_priv *priv = (struct iwl_priv *)data;
+	struct iwl_priv *priv = from_timer(priv, t, statistics_periodic);
 
 	if (test_bit(STATUS_EXIT_PENDING, &priv->status))
 		return;
@@ -556,9 +556,9 @@ static void iwl_continuous_event_trace(struct iwl_priv *priv)
  * this function is to perform continuous uCode event logging operation
  * if enabled
  */
-static void iwl_bg_ucode_trace(unsigned long data)
+static void iwl_bg_ucode_trace(struct timer_list *t)
 {
-	struct iwl_priv *priv = (struct iwl_priv *)data;
+	struct iwl_priv *priv = from_timer(priv, t, ucode_trace);
 
 	if (test_bit(STATUS_EXIT_PENDING, &priv->status))
 		return;
@@ -1085,11 +1085,9 @@ static void iwl_setup_deferred_work(struct iwl_priv *priv)
 	if (priv->lib->bt_params)
 		iwlagn_bt_setup_deferred_work(priv);
 
-	setup_timer(&priv->statistics_periodic, iwl_bg_statistics_periodic,
-		    (unsigned long)priv);
+	timer_setup(&priv->statistics_periodic, iwl_bg_statistics_periodic, 0);
 
-	setup_timer(&priv->ucode_trace, iwl_bg_ucode_trace,
-		    (unsigned long)priv);
+	timer_setup(&priv->ucode_trace, iwl_bg_ucode_trace, 0);
 }
 
 void iwl_cancel_deferred_work(struct iwl_priv *priv)
diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
index b5c459cd70ce..fed6d842a5e1 100644
--- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
+++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c
@@ -147,9 +147,9 @@ void iwl_pcie_free_dma_ptr(struct iwl_trans *trans, struct iwl_dma_ptr *ptr)
 	memset(ptr, 0, sizeof(*ptr));
 }
 
-static void iwl_pcie_txq_stuck_timer(unsigned long data)
+static void iwl_pcie_txq_stuck_timer(struct timer_list *t)
 {
-	struct iwl_txq *txq = (void *)data;
+	struct iwl_txq *txq = from_timer(txq, t, stuck_timer);
 	struct iwl_trans_pcie *trans_pcie = txq->trans_pcie;
 	struct iwl_trans *trans = iwl_trans_pcie_get_trans(trans_pcie);
 
@@ -495,8 +495,7 @@ int iwl_pcie_txq_alloc(struct iwl_trans *trans, struct iwl_txq *txq,
 	if (WARN_ON(txq->entries || txq->tfds))
 		return -EINVAL;
 
-	setup_timer(&txq->stuck_timer, iwl_pcie_txq_stuck_timer,
-		    (unsigned long)txq);
+	timer_setup(&txq->stuck_timer, iwl_pcie_txq_stuck_timer, 0);
 	txq->trans_pcie = trans_pcie;
 
 	txq->n_window = slots_num;
diff --git a/drivers/net/wireless/intersil/hostap/hostap_ap.c b/drivers/net/wireless/intersil/hostap/hostap_ap.c
index f9d047314692..b4dfe1893d18 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_ap.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_ap.c
@@ -185,9 +185,9 @@ static void hostap_event_expired_sta(struct net_device *dev,
 
 #ifndef PRISM2_NO_KERNEL_IEEE80211_MGMT
 
-static void ap_handle_timer(unsigned long data)
+static void ap_handle_timer(struct timer_list *t)
 {
-	struct sta_info *sta = (struct sta_info *) data;
+	struct sta_info *sta = from_timer(sta, t, timer);
 	local_info_t *local;
 	struct ap_data *ap;
 	unsigned long next_time = 0;
@@ -1189,7 +1189,7 @@ static struct sta_info * ap_add_sta(struct ap_data *ap, u8 *addr)
 	}
 
 #ifndef PRISM2_NO_KERNEL_IEEE80211_MGMT
-	setup_timer(&sta->timer, ap_handle_timer, (unsigned long)sta);
+	timer_setup(&sta->timer, ap_handle_timer, 0);
 	sta->timer.expires = jiffies + ap->max_inactivity;
 	if (!ap->local->hostapd)
 		add_timer(&sta->timer);
diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c
index 8177fd6f65c1..5c4a17a18968 100644
--- a/drivers/net/wireless/intersil/hostap/hostap_hw.c
+++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c
@@ -2794,9 +2794,9 @@ static void prism2_check_sta_fw_version(local_info_t *local)
 }
 
 
-static void hostap_passive_scan(unsigned long data)
+static void hostap_passive_scan(struct timer_list *t)
 {
-	local_info_t *local = (local_info_t *) data;
+	local_info_t *local = from_timer(local, t, passive_scan_timer);
 	struct net_device *dev = local->dev;
 	u16 chan;
 
@@ -2869,10 +2869,10 @@ static void handle_comms_qual_update(struct work_struct *work)
  * used to monitor that local->last_tick_timer is being updated. If not,
  * interrupt busy-loop is assumed and driver tries to recover by masking out
  * some events. */
-static void hostap_tick_timer(unsigned long data)
+static void hostap_tick_timer(struct timer_list *t)
 {
 	static unsigned long last_inquire = 0;
-	local_info_t *local = (local_info_t *) data;
+	local_info_t *local = from_timer(local, t, tick_timer);
 	local->last_tick_timer = jiffies;
 
 	/* Inquire CommTallies every 10 seconds to keep the statistics updated
@@ -3225,10 +3225,8 @@ while (0)
 
 	lib80211_crypt_info_init(&local->crypt_info, dev->name, &local->lock);
 
-	setup_timer(&local->passive_scan_timer, hostap_passive_scan,
-		    (unsigned long)local);
-	setup_timer(&local->tick_timer, hostap_tick_timer,
-		    (unsigned long)local);
+	timer_setup(&local->passive_scan_timer, hostap_passive_scan, 0);
+	timer_setup(&local->tick_timer, hostap_tick_timer, 0);
 	local->tick_timer.expires = jiffies + 2 * HZ;
 	add_timer(&local->tick_timer);
 
diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
index 501180584b4b..94ad6fe29e69 100644
--- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
+++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c
@@ -319,9 +319,9 @@ static inline void ezusb_mod_timer(struct ezusb_priv *upriv,
 	mod_timer(timer, expire);
 }
 
-static void ezusb_request_timerfn(u_long _ctx)
+static void ezusb_request_timerfn(struct timer_list *t)
 {
-	struct request_context *ctx = (void *) _ctx;
+	struct request_context *ctx = from_timer(ctx, t, timer);
 
 	ctx->outurb->transfer_flags |= URB_ASYNC_UNLINK;
 	if (usb_unlink_urb(ctx->outurb) == -EINPROGRESS) {
@@ -365,7 +365,7 @@ static struct request_context *ezusb_alloc_ctx(struct ezusb_priv *upriv,
 	refcount_set(&ctx->refcount, 1);
 	init_completion(&ctx->done);
 
-	setup_timer(&ctx->timer, ezusb_request_timerfn, (u_long)ctx);
+	timer_setup(&ctx->timer, ezusb_request_timerfn, 0);
 	return ctx;
 }
 
diff --git a/drivers/net/wireless/quantenna/qtnfmac/core.c b/drivers/net/wireless/quantenna/qtnfmac/core.c
index 2d2c1ea65cb2..3423dc51198b 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/core.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/core.c
@@ -288,7 +288,7 @@ static struct qtnf_wmac *qtnf_core_mac_alloc(struct qtnf_bus *bus,
 		mac->iflist[i].vifid = i;
 		qtnf_sta_list_init(&mac->iflist[i].sta_list);
 		mutex_init(&mac->mac_lock);
-		setup_timer(&mac->scan_timeout, NULL, 0);
+		timer_setup(&mac->scan_timeout, NULL, 0);
 	}
 
 	qtnf_mac_init_primary_intf(mac);
diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c
index c346c021b999..d47921a84509 100644
--- a/drivers/net/wireless/ti/wlcore/main.c
+++ b/drivers/net/wireless/ti/wlcore/main.c
@@ -196,9 +196,9 @@ out:
 	mutex_unlock(&wl->mutex);
 }
 
-static void wl1271_rx_streaming_timer(unsigned long data)
+static void wl1271_rx_streaming_timer(struct timer_list *t)
 {
-	struct wl12xx_vif *wlvif = (struct wl12xx_vif *)data;
+	struct wl12xx_vif *wlvif = from_timer(wlvif, t, rx_streaming_timer);
 	struct wl1271 *wl = wlvif->wl;
 	ieee80211_queue_work(wl->hw, &wlvif->rx_streaming_disable_work);
 }
@@ -2279,8 +2279,7 @@ static int wl12xx_init_vif_data(struct wl1271 *wl, struct ieee80211_vif *vif)
 			  wlcore_pending_auth_complete_work);
 	INIT_LIST_HEAD(&wlvif->list);
 
-	setup_timer(&wlvif->rx_streaming_timer, wl1271_rx_streaming_timer,
-		    (unsigned long) wlvif);
+	timer_setup(&wlvif->rx_streaming_timer, wl1271_rx_streaming_timer, 0);
 	return 0;
 }
 
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 8b8689c6d887..18c85e55e76a 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -228,9 +228,9 @@ static bool xennet_can_sg(struct net_device *dev)
 }
 
 
-static void rx_refill_timeout(unsigned long data)
+static void rx_refill_timeout(struct timer_list *t)
 {
-	struct netfront_queue *queue = (struct netfront_queue *)data;
+	struct netfront_queue *queue = from_timer(queue, t, rx_refill_timer);
 	napi_schedule(&queue->napi);
 }
 
@@ -1605,8 +1605,7 @@ static int xennet_init_queue(struct netfront_queue *queue)
 	spin_lock_init(&queue->tx_lock);
 	spin_lock_init(&queue->rx_lock);
 
-	setup_timer(&queue->rx_refill_timer, rx_refill_timeout,
-		    (unsigned long)queue);
+	timer_setup(&queue->rx_refill_timer, rx_refill_timeout, 0);
 
 	snprintf(queue->name, sizeof(queue->name), "%s-q%u",
 		 queue->info->netdev->name, queue->id);
diff --git a/drivers/nfc/pn533/pn533.c b/drivers/nfc/pn533/pn533.c
index 2effa5ff7082..a0cc1cc45292 100644
--- a/drivers/nfc/pn533/pn533.c
+++ b/drivers/nfc/pn533/pn533.c
@@ -1232,9 +1232,9 @@ static int pn533_init_target_complete(struct pn533 *dev, struct sk_buff *resp)
 	return 0;
 }
 
-static void pn533_listen_mode_timer(unsigned long data)
+static void pn533_listen_mode_timer(struct timer_list *t)
 {
-	struct pn533 *dev = (struct pn533 *)data;
+	struct pn533 *dev = from_timer(dev, t, listen_timer);
 
 	dev_dbg(dev->dev, "Listen mode timeout\n");
 
@@ -2632,8 +2632,7 @@ struct pn533 *pn533_register_device(u32 device_type,
 	if (priv->wq == NULL)
 		goto error;
 
-	setup_timer(&priv->listen_timer, pn533_listen_mode_timer,
-		    (unsigned long)priv);
+	timer_setup(&priv->listen_timer, pn533_listen_mode_timer, 0);
 
 	skb_queue_head_init(&priv->resp_q);
 	skb_queue_head_init(&priv->fragment_skb);
diff --git a/drivers/nfc/st-nci/ndlc.c b/drivers/nfc/st-nci/ndlc.c
index 93a7536a9af9..f26d938d240f 100644
--- a/drivers/nfc/st-nci/ndlc.c
+++ b/drivers/nfc/st-nci/ndlc.c
@@ -246,18 +246,18 @@ void ndlc_recv(struct llt_ndlc *ndlc, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(ndlc_recv);
 
-static void ndlc_t1_timeout(unsigned long data)
+static void ndlc_t1_timeout(struct timer_list *t)
 {
-	struct llt_ndlc *ndlc = (struct llt_ndlc *)data;
+	struct llt_ndlc *ndlc = from_timer(ndlc, t, t1_timer);
 
 	pr_debug("\n");
 
 	schedule_work(&ndlc->sm_work);
 }
 
-static void ndlc_t2_timeout(unsigned long data)
+static void ndlc_t2_timeout(struct timer_list *t)
 {
-	struct llt_ndlc *ndlc = (struct llt_ndlc *)data;
+	struct llt_ndlc *ndlc = from_timer(ndlc, t, t2_timer);
 
 	pr_debug("\n");
 
@@ -282,8 +282,8 @@ int ndlc_probe(void *phy_id, struct nfc_phy_ops *phy_ops, struct device *dev,
 	*ndlc_id = ndlc;
 
 	/* initialize timers */
-	setup_timer(&ndlc->t1_timer, ndlc_t1_timeout, (unsigned long)ndlc);
-	setup_timer(&ndlc->t2_timer, ndlc_t2_timeout, (unsigned long)ndlc);
+	timer_setup(&ndlc->t1_timer, ndlc_t1_timeout, 0);
+	timer_setup(&ndlc->t2_timer, ndlc_t2_timeout, 0);
 
 	skb_queue_head_init(&ndlc->rcv_q);
 	skb_queue_head_init(&ndlc->send_q);
diff --git a/drivers/ntb/test/ntb_pingpong.c b/drivers/ntb/test/ntb_pingpong.c
index 938a18bcfc3f..3f5a92bae6f8 100644
--- a/drivers/ntb/test/ntb_pingpong.c
+++ b/drivers/ntb/test/ntb_pingpong.c
@@ -107,9 +107,9 @@ struct pp_ctx {
 
 static struct dentry *pp_debugfs_dir;
 
-static void pp_ping(unsigned long ctx)
+static void pp_ping(struct timer_list *t)
 {
-	struct pp_ctx *pp = (void *)ctx;
+	struct pp_ctx *pp = from_timer(pp, t, db_timer);
 	unsigned long irqflags;
 	u64 db_bits, db_mask;
 	u32 spad_rd, spad_wr;
@@ -153,7 +153,7 @@ static void pp_link_event(void *ctx)
 
 	if (ntb_link_is_up(pp->ntb, NULL, NULL) == 1) {
 		dev_dbg(&pp->ntb->dev, "link is up\n");
-		pp_ping((unsigned long)pp);
+		pp_ping(&pp->db_timer);
 	} else {
 		dev_dbg(&pp->ntb->dev, "link is down\n");
 		del_timer(&pp->db_timer);
@@ -252,7 +252,7 @@ static int pp_probe(struct ntb_client *client,
 	pp->db_bits = 0;
 	atomic_set(&pp->count, 0);
 	spin_lock_init(&pp->db_lock);
-	setup_timer(&pp->db_timer, pp_ping, (unsigned long)pp);
+	timer_setup(&pp->db_timer, pp_ping, 0);
 	pp->db_delay = msecs_to_jiffies(delay_ms);
 
 	rc = ntb_set_ctx(ntb, pp, &pp_ops);
diff --git a/drivers/platform/x86/sony-laptop.c b/drivers/platform/x86/sony-laptop.c
index 62aa2c37b8d2..935121814c97 100644
--- a/drivers/platform/x86/sony-laptop.c
+++ b/drivers/platform/x86/sony-laptop.c
@@ -363,7 +363,7 @@ static int sony_laptop_input_keycode_map[] = {
 };
 
 /* release buttons after a short delay if pressed */
-static void do_sony_laptop_release_key(unsigned long unused)
+static void do_sony_laptop_release_key(struct timer_list *unused)
 {
 	struct sony_laptop_keypress kp;
 	unsigned long flags;
@@ -470,7 +470,7 @@ static int sony_laptop_setup_input(struct acpi_device *acpi_device)
 		goto err_dec_users;
 	}
 
-	setup_timer(&sony_laptop_input.release_key_timer,
+	timer_setup(&sony_laptop_input.release_key_timer,
 		    do_sony_laptop_release_key, 0);
 
 	/* input keys */
diff --git a/drivers/pps/clients/pps-ktimer.c b/drivers/pps/clients/pps-ktimer.c
index 436b4e4e71a1..04735649052a 100644
--- a/drivers/pps/clients/pps-ktimer.c
+++ b/drivers/pps/clients/pps-ktimer.c
@@ -39,7 +39,7 @@ static struct timer_list ktimer;
  * The kernel timer
  */
 
-static void pps_ktimer_event(unsigned long ptr)
+static void pps_ktimer_event(struct timer_list *unused)
 {
 	struct pps_event_time ts;
 
@@ -85,7 +85,7 @@ static int __init pps_ktimer_init(void)
 		return -ENOMEM;
 	}
 
-	setup_timer(&ktimer, pps_ktimer_event, 0);
+	timer_setup(&ktimer, pps_ktimer_event, 0);
 	mod_timer(&ktimer, jiffies + HZ);
 
 	dev_info(pps->dev, "ktimer PPS source registered\n");
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index 00efe24a6063..215eac68ae2d 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -71,9 +71,9 @@ static void rtc_uie_task(struct work_struct *work)
 	if (num)
 		rtc_handle_legacy_irq(rtc, num, RTC_UF);
 }
-static void rtc_uie_timer(unsigned long data)
+static void rtc_uie_timer(struct timer_list *t)
 {
-	struct rtc_device *rtc = (struct rtc_device *)data;
+	struct rtc_device *rtc = from_timer(rtc, t, uie_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&rtc->irq_lock, flags);
@@ -460,7 +460,7 @@ void rtc_dev_prepare(struct rtc_device *rtc)
 
 #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL
 	INIT_WORK(&rtc->uie_task, rtc_uie_task);
-	setup_timer(&rtc->uie_timer, rtc_uie_timer, (unsigned long)rtc);
+	timer_setup(&rtc->uie_timer, rtc_uie_timer, 0);
 #endif
 
 	cdev_init(&rtc->char_dev, &rtc_dev_fops);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index adba91318768..0f1ff0813493 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -70,8 +70,8 @@ static void do_restore_device(struct work_struct *);
 static void do_reload_device(struct work_struct *);
 static void do_requeue_requests(struct work_struct *);
 static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *);
-static void dasd_device_timeout(unsigned long);
-static void dasd_block_timeout(unsigned long);
+static void dasd_device_timeout(struct timer_list *);
+static void dasd_block_timeout(struct timer_list *);
 static void __dasd_process_erp(struct dasd_device *, struct dasd_ccw_req *);
 static void dasd_profile_init(struct dasd_profile *, struct dentry *);
 static void dasd_profile_exit(struct dasd_profile *);
@@ -119,8 +119,7 @@ struct dasd_device *dasd_alloc_device(void)
 		     (void (*)(unsigned long)) dasd_device_tasklet,
 		     (unsigned long) device);
 	INIT_LIST_HEAD(&device->ccw_queue);
-	setup_timer(&device->timer, dasd_device_timeout,
-		    (unsigned long)device);
+	timer_setup(&device->timer, dasd_device_timeout, 0);
 	INIT_WORK(&device->kick_work, do_kick_device);
 	INIT_WORK(&device->restore_device, do_restore_device);
 	INIT_WORK(&device->reload_device, do_reload_device);
@@ -162,7 +161,7 @@ struct dasd_block *dasd_alloc_block(void)
 		     (unsigned long) block);
 	INIT_LIST_HEAD(&block->ccw_queue);
 	spin_lock_init(&block->queue_lock);
-	setup_timer(&block->timer, dasd_block_timeout, (unsigned long)block);
+	timer_setup(&block->timer, dasd_block_timeout, 0);
 	spin_lock_init(&block->profile.lock);
 
 	return block;
@@ -1557,12 +1556,12 @@ EXPORT_SYMBOL(dasd_start_IO);
  * The head of the ccw queue will have status DASD_CQR_IN_IO for 1),
  * DASD_CQR_QUEUED for 2) and 3).
  */
-static void dasd_device_timeout(unsigned long ptr)
+static void dasd_device_timeout(struct timer_list *t)
 {
 	unsigned long flags;
 	struct dasd_device *device;
 
-	device = (struct dasd_device *) ptr;
+	device = from_timer(device, t, timer);
 	spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
 	/* re-activate request queue */
 	dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING);
@@ -2625,12 +2624,12 @@ EXPORT_SYMBOL(dasd_cancel_req);
  * is waiting for something that may not come reliably, (e.g. a state
  * change interrupt)
  */
-static void dasd_block_timeout(unsigned long ptr)
+static void dasd_block_timeout(struct timer_list *t)
 {
 	unsigned long flags;
 	struct dasd_block *block;
 
-	block = (struct dasd_block *) ptr;
+	block = from_timer(block, t, timer);
 	spin_lock_irqsave(get_ccwdev_lock(block->base->cdev), flags);
 	/* re-activate request queue */
 	dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING);
diff --git a/drivers/s390/net/fsm.c b/drivers/s390/net/fsm.c
index 16b81be1f07a..c81adf8042d7 100644
--- a/drivers/s390/net/fsm.c
+++ b/drivers/s390/net/fsm.c
@@ -129,8 +129,9 @@ fsm_getstate_str(fsm_instance *fi)
 }
 
 static void
-fsm_expire_timer(fsm_timer *this)
+fsm_expire_timer(struct timer_list *t)
 {
+	fsm_timer *this = from_timer(this, t, tl);
 #if FSM_TIMER_DEBUG
 	printk(KERN_DEBUG "fsm(%s): Timer %p expired\n",
 	       this->fi->name, this);
@@ -146,7 +147,7 @@ fsm_settimer(fsm_instance *fi, fsm_timer *this)
 	printk(KERN_DEBUG "fsm(%s): Create timer %p\n", fi->name,
 	       this);
 #endif
-	setup_timer(&this->tl, (void *)fsm_expire_timer, (long)this);
+	timer_setup(&this->tl, fsm_expire_timer, 0);
 }
 
 void
@@ -168,7 +169,7 @@ fsm_addtimer(fsm_timer *this, int millisec, int event, void *arg)
 	       this->fi->name, this, millisec);
 #endif
 
-	setup_timer(&this->tl, (void *)fsm_expire_timer, (long)this);
+	timer_setup(&this->tl, fsm_expire_timer, 0);
 	this->expire_event = event;
 	this->event_arg = arg;
 	this->tl.expires = jiffies + (millisec * HZ) / 1000;
@@ -187,7 +188,7 @@ fsm_modtimer(fsm_timer *this, int millisec, int event, void *arg)
 #endif
 
 	del_timer(&this->tl);
-	setup_timer(&this->tl, (void *)fsm_expire_timer, (long)this);
+	timer_setup(&this->tl, fsm_expire_timer, 0);
 	this->expire_event = event;
 	this->event_arg = arg;
 	this->tl.expires = jiffies + (millisec * HZ) / 1000;
diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c
index a54b6c11b505..21f6421536a0 100644
--- a/drivers/scsi/arcmsr/arcmsr_hba.c
+++ b/drivers/scsi/arcmsr/arcmsr_hba.c
@@ -101,7 +101,7 @@ static void arcmsr_enable_outbound_ints(struct AdapterControlBlock *acb,
 static void arcmsr_stop_adapter_bgrb(struct AdapterControlBlock *acb);
 static void arcmsr_hbaA_flush_cache(struct AdapterControlBlock *acb);
 static void arcmsr_hbaB_flush_cache(struct AdapterControlBlock *acb);
-static void arcmsr_request_device_map(unsigned long pacb);
+static void arcmsr_request_device_map(struct timer_list *t);
 static void arcmsr_hbaA_request_device_map(struct AdapterControlBlock *acb);
 static void arcmsr_hbaB_request_device_map(struct AdapterControlBlock *acb);
 static void arcmsr_hbaC_request_device_map(struct AdapterControlBlock *acb);
@@ -837,8 +837,7 @@ static int arcmsr_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	atomic_set(&acb->rq_map_token, 16);
 	atomic_set(&acb->ante_token_value, 16);
 	acb->fw_flag = FW_NORMAL;
-	setup_timer(&acb->eternal_timer, &arcmsr_request_device_map,
-		    (unsigned long)acb);
+	timer_setup(&acb->eternal_timer, arcmsr_request_device_map, 0);
 	acb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ);
 	add_timer(&acb->eternal_timer);
 	if(arcmsr_alloc_sysfs_attr(acb))
@@ -929,8 +928,7 @@ static int arcmsr_resume(struct pci_dev *pdev)
 	atomic_set(&acb->rq_map_token, 16);
 	atomic_set(&acb->ante_token_value, 16);
 	acb->fw_flag = FW_NORMAL;
-	setup_timer(&acb->eternal_timer, &arcmsr_request_device_map,
-		    (unsigned long)acb);
+	timer_setup(&acb->eternal_timer, arcmsr_request_device_map, 0);
 	acb->eternal_timer.expires = jiffies + msecs_to_jiffies(6 * HZ);
 	add_timer(&acb->eternal_timer);
 	return 0;
@@ -3457,9 +3455,9 @@ static void arcmsr_hbaD_request_device_map(struct AdapterControlBlock *acb)
 	}
 }
 
-static void arcmsr_request_device_map(unsigned long pacb)
+static void arcmsr_request_device_map(struct timer_list *t)
 {
-	struct AdapterControlBlock *acb = (struct AdapterControlBlock *)pacb;
+	struct AdapterControlBlock *acb = from_timer(acb, t, eternal_timer);
 	switch (acb->adapter_type) {
 		case ACB_ADAPTER_TYPE_A: {
 			arcmsr_hbaA_request_device_map(acb);
diff --git a/drivers/scsi/arm/fas216.c b/drivers/scsi/arm/fas216.c
index 7304d5a4fc4f..f4775ca70bab 100644
--- a/drivers/scsi/arm/fas216.c
+++ b/drivers/scsi/arm/fas216.c
@@ -2318,9 +2318,9 @@ DEF_SCSI_QCMD(fas216_noqueue_command)
  * Error handler timeout function.  Indicate that we timed out,
  * and wake up any error handler process so it can continue.
  */
-static void fas216_eh_timer(unsigned long data)
+static void fas216_eh_timer(struct timer_list *t)
 {
-	FAS216_Info *info = (FAS216_Info *)data;
+	FAS216_Info *info = from_timer(info, t, eh_timer);
 
 	fas216_log(info, LOG_ERROR, "error handling timed out\n");
 
@@ -2849,7 +2849,7 @@ int fas216_init(struct Scsi_Host *host)
 	info->rst_dev_status = -1;
 	info->rst_bus_status = -1;
 	init_waitqueue_head(&info->eh_wait);
-	setup_timer(&info->eh_timer, fas216_eh_timer, (unsigned long)info);
+	timer_setup(&info->eh_timer, fas216_eh_timer, 0);
 	
 	spin_lock_init(&info->host_lock);
 
diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c
index d10826a69725..cf0466686804 100644
--- a/drivers/scsi/bfa/bfad.c
+++ b/drivers/scsi/bfa/bfad.c
@@ -692,9 +692,9 @@ ext:
 }
 
 void
-bfad_bfa_tmo(unsigned long data)
+bfad_bfa_tmo(struct timer_list *t)
 {
-	struct bfad_s	      *bfad = (struct bfad_s *) data;
+	struct bfad_s	      *bfad = from_timer(bfad, t, hal_tmo);
 	unsigned long	flags;
 	struct list_head	       doneq;
 
@@ -719,7 +719,7 @@ bfad_bfa_tmo(unsigned long data)
 void
 bfad_init_timer(struct bfad_s *bfad)
 {
-	setup_timer(&bfad->hal_tmo, bfad_bfa_tmo, (unsigned long)bfad);
+	timer_setup(&bfad->hal_tmo, bfad_bfa_tmo, 0);
 
 	mod_timer(&bfad->hal_tmo,
 		  jiffies + msecs_to_jiffies(BFA_TIMER_FREQ));
diff --git a/drivers/scsi/bfa/bfad_drv.h b/drivers/scsi/bfa/bfad_drv.h
index cfcfff48e8e1..4fe980a6441f 100644
--- a/drivers/scsi/bfa/bfad_drv.h
+++ b/drivers/scsi/bfa/bfad_drv.h
@@ -314,7 +314,7 @@ int		bfad_setup_intr(struct bfad_s *bfad);
 void		bfad_remove_intr(struct bfad_s *bfad);
 void		bfad_update_hal_cfg(struct bfa_iocfc_cfg_s *bfa_cfg);
 bfa_status_t	bfad_hal_mem_alloc(struct bfad_s *bfad);
-void		bfad_bfa_tmo(unsigned long data);
+void		bfad_bfa_tmo(struct timer_list *t);
 void		bfad_init_timer(struct bfad_s *bfad);
 int		bfad_pci_init(struct pci_dev *pdev, struct bfad_s *bfad);
 void		bfad_pci_uninit(struct pci_dev *pdev, struct bfad_s *bfad);
diff --git a/drivers/scsi/bnx2fc/bnx2fc_tgt.c b/drivers/scsi/bnx2fc/bnx2fc_tgt.c
index 59a2dfbcbc69..a8ae1a019eea 100644
--- a/drivers/scsi/bnx2fc/bnx2fc_tgt.c
+++ b/drivers/scsi/bnx2fc/bnx2fc_tgt.c
@@ -14,8 +14,8 @@
  */
 
 #include "bnx2fc.h"
-static void bnx2fc_upld_timer(unsigned long data);
-static void bnx2fc_ofld_timer(unsigned long data);
+static void bnx2fc_upld_timer(struct timer_list *t);
+static void bnx2fc_ofld_timer(struct timer_list *t);
 static int bnx2fc_init_tgt(struct bnx2fc_rport *tgt,
 			   struct fcoe_port *port,
 			   struct fc_rport_priv *rdata);
@@ -27,10 +27,10 @@ static void bnx2fc_free_session_resc(struct bnx2fc_hba *hba,
 			      struct bnx2fc_rport *tgt);
 static void bnx2fc_free_conn_id(struct bnx2fc_hba *hba, u32 conn_id);
 
-static void bnx2fc_upld_timer(unsigned long data)
+static void bnx2fc_upld_timer(struct timer_list *t)
 {
 
-	struct bnx2fc_rport *tgt = (struct bnx2fc_rport *)data;
+	struct bnx2fc_rport *tgt = from_timer(tgt, t, upld_timer);
 
 	BNX2FC_TGT_DBG(tgt, "upld_timer - Upload compl not received!!\n");
 	/* fake upload completion */
@@ -40,10 +40,10 @@ static void bnx2fc_upld_timer(unsigned long data)
 	wake_up_interruptible(&tgt->upld_wait);
 }
 
-static void bnx2fc_ofld_timer(unsigned long data)
+static void bnx2fc_ofld_timer(struct timer_list *t)
 {
 
-	struct bnx2fc_rport *tgt = (struct bnx2fc_rport *)data;
+	struct bnx2fc_rport *tgt = from_timer(tgt, t, ofld_timer);
 
 	BNX2FC_TGT_DBG(tgt, "entered bnx2fc_ofld_timer\n");
 	/* NOTE: This function should never be called, as
@@ -65,7 +65,7 @@ static void bnx2fc_ofld_timer(unsigned long data)
 
 static void bnx2fc_ofld_wait(struct bnx2fc_rport *tgt)
 {
-	setup_timer(&tgt->ofld_timer, bnx2fc_ofld_timer, (unsigned long)tgt);
+	timer_setup(&tgt->ofld_timer, bnx2fc_ofld_timer, 0);
 	mod_timer(&tgt->ofld_timer, jiffies + BNX2FC_FW_TIMEOUT);
 
 	wait_event_interruptible(tgt->ofld_wait,
@@ -277,7 +277,7 @@ void bnx2fc_flush_active_ios(struct bnx2fc_rport *tgt)
 
 static void bnx2fc_upld_wait(struct bnx2fc_rport *tgt)
 {
-	setup_timer(&tgt->upld_timer, bnx2fc_upld_timer, (unsigned long)tgt);
+	timer_setup(&tgt->upld_timer, bnx2fc_upld_timer, 0);
 	mod_timer(&tgt->upld_timer, jiffies + BNX2FC_FW_TIMEOUT);
 	wait_event_interruptible(tgt->upld_wait,
 				 (test_bit(
diff --git a/drivers/scsi/esas2r/esas2r_main.c b/drivers/scsi/esas2r/esas2r_main.c
index af4af504a97f..4eb14301a497 100644
--- a/drivers/scsi/esas2r/esas2r_main.c
+++ b/drivers/scsi/esas2r/esas2r_main.c
@@ -1631,11 +1631,11 @@ void esas2r_adapter_tasklet(unsigned long context)
 	}
 }
 
-static void esas2r_timer_callback(unsigned long context);
+static void esas2r_timer_callback(struct timer_list *t);
 
 void esas2r_kickoff_timer(struct esas2r_adapter *a)
 {
-	setup_timer(&a->timer, esas2r_timer_callback, (unsigned long)a);
+	timer_setup(&a->timer, esas2r_timer_callback, 0);
 
 	a->timer.expires = jiffies +
 			   msecs_to_jiffies(100);
@@ -1643,9 +1643,9 @@ void esas2r_kickoff_timer(struct esas2r_adapter *a)
 	add_timer(&a->timer);
 }
 
-static void esas2r_timer_callback(unsigned long context)
+static void esas2r_timer_callback(struct timer_list *t)
 {
-	struct esas2r_adapter *a = (struct esas2r_adapter *)context;
+	struct esas2r_adapter *a = from_timer(a, t, timer);
 
 	set_bit(AF2_TIMER_TICK, &a->flags2);
 
diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
index fff6f1851dc1..097f37de6ce9 100644
--- a/drivers/scsi/fcoe/fcoe_ctlr.c
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
@@ -49,7 +49,7 @@
 #define	FCOE_CTLR_MIN_FKA	500		/* min keep alive (mS) */
 #define	FCOE_CTLR_DEF_FKA	FIP_DEF_FKA	/* default keep alive (mS) */
 
-static void fcoe_ctlr_timeout(unsigned long);
+static void fcoe_ctlr_timeout(struct timer_list *);
 static void fcoe_ctlr_timer_work(struct work_struct *);
 static void fcoe_ctlr_recv_work(struct work_struct *);
 static int fcoe_ctlr_flogi_retry(struct fcoe_ctlr *);
@@ -156,7 +156,7 @@ void fcoe_ctlr_init(struct fcoe_ctlr *fip, enum fip_state mode)
 	mutex_init(&fip->ctlr_mutex);
 	spin_lock_init(&fip->ctlr_lock);
 	fip->flogi_oxid = FC_XID_UNKNOWN;
-	setup_timer(&fip->timer, fcoe_ctlr_timeout, (unsigned long)fip);
+	timer_setup(&fip->timer, fcoe_ctlr_timeout, 0);
 	INIT_WORK(&fip->timer_work, fcoe_ctlr_timer_work);
 	INIT_WORK(&fip->recv_work, fcoe_ctlr_recv_work);
 	skb_queue_head_init(&fip->fip_recv_list);
@@ -1786,9 +1786,9 @@ unlock:
  * fcoe_ctlr_timeout() - FIP timeout handler
  * @arg: The FCoE controller that timed out
  */
-static void fcoe_ctlr_timeout(unsigned long arg)
+static void fcoe_ctlr_timeout(struct timer_list *t)
 {
-	struct fcoe_ctlr *fip = (struct fcoe_ctlr *)arg;
+	struct fcoe_ctlr *fip = from_timer(fip, t, timer);
 
 	schedule_work(&fip->timer_work);
 }
diff --git a/drivers/scsi/fnic/fnic_main.c b/drivers/scsi/fnic/fnic_main.c
index aacadbf20b69..e52599f44170 100644
--- a/drivers/scsi/fnic/fnic_main.c
+++ b/drivers/scsi/fnic/fnic_main.c
@@ -407,18 +407,18 @@ static int fnic_notify_set(struct fnic *fnic)
 	return err;
 }
 
-static void fnic_notify_timer(unsigned long data)
+static void fnic_notify_timer(struct timer_list *t)
 {
-	struct fnic *fnic = (struct fnic *)data;
+	struct fnic *fnic = from_timer(fnic, t, notify_timer);
 
 	fnic_handle_link_event(fnic);
 	mod_timer(&fnic->notify_timer,
 		  round_jiffies(jiffies + FNIC_NOTIFY_TIMER_PERIOD));
 }
 
-static void fnic_fip_notify_timer(unsigned long data)
+static void fnic_fip_notify_timer(struct timer_list *t)
 {
-	struct fnic *fnic = (struct fnic *)data;
+	struct fnic *fnic = from_timer(fnic, t, fip_timer);
 
 	fnic_handle_fip_timer(fnic);
 }
@@ -777,8 +777,7 @@ static int fnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 		vnic_dev_add_addr(fnic->vdev, fnic->ctlr.ctl_src_addr);
 		fnic->set_vlan = fnic_set_vlan;
 		fcoe_ctlr_init(&fnic->ctlr, FIP_MODE_AUTO);
-		setup_timer(&fnic->fip_timer, fnic_fip_notify_timer,
-							(unsigned long)fnic);
+		timer_setup(&fnic->fip_timer, fnic_fip_notify_timer, 0);
 		spin_lock_init(&fnic->vlans_lock);
 		INIT_WORK(&fnic->fip_frame_work, fnic_handle_fip_frame);
 		INIT_WORK(&fnic->event_work, fnic_handle_event);
@@ -809,8 +808,7 @@ static int fnic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	/* Setup notify timer when using MSI interrupts */
 	if (vnic_dev_get_intr_mode(fnic->vdev) == VNIC_DEV_INTR_MODE_MSI)
-		setup_timer(&fnic->notify_timer,
-			    fnic_notify_timer, (unsigned long)fnic);
+		timer_setup(&fnic->notify_timer, fnic_notify_timer, 0);
 
 	/* allocate RQ buffers and post them to RQ*/
 	for (i = 0; i < fnic->rq_count; i++) {
diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c
index 017216f5e919..dc4e801b2cef 100644
--- a/drivers/scsi/ncr53c8xx.c
+++ b/drivers/scsi/ncr53c8xx.c
@@ -8093,9 +8093,9 @@ irqreturn_t ncr53c8xx_intr(int irq, void *dev_id)
      return IRQ_HANDLED;
 }
 
-static void ncr53c8xx_timeout(unsigned long npref)
+static void ncr53c8xx_timeout(struct timer_list *t)
 {
-	struct ncb *np = (struct ncb *) npref;
+	struct ncb *np = from_timer(np, t, timer);
 	unsigned long flags;
 	struct scsi_cmnd *done_list;
 
@@ -8357,7 +8357,7 @@ struct Scsi_Host * __init ncr_attach(struct scsi_host_template *tpnt,
 	if (!np->scripth0)
 		goto attach_error;
 
-	setup_timer(&np->timer, ncr53c8xx_timeout, (unsigned long)np);
+	timer_setup(&np->timer, ncr53c8xx_timeout, 0);
 
 	/* Try to map the controller chip to virtual and physical memory. */
 
diff --git a/drivers/staging/greybus/operation.c b/drivers/staging/greybus/operation.c
index 609332b3e15b..c462b1c046cd 100644
--- a/drivers/staging/greybus/operation.c
+++ b/drivers/staging/greybus/operation.c
@@ -293,9 +293,9 @@ static void gb_operation_work(struct work_struct *work)
 	gb_operation_put(operation);
 }
 
-static void gb_operation_timeout(unsigned long arg)
+static void gb_operation_timeout(struct timer_list *t)
 {
-	struct gb_operation *operation = (void *)arg;
+	struct gb_operation *operation = from_timer(operation, t, timer);
 
 	if (gb_operation_result_set(operation, -ETIMEDOUT)) {
 		/*
@@ -540,8 +540,7 @@ gb_operation_create_common(struct gb_connection *connection, u8 type,
 			goto err_request;
 		}
 
-		setup_timer(&operation->timer, gb_operation_timeout,
-			    (unsigned long)operation);
+		timer_setup(&operation->timer, gb_operation_timeout, 0);
 	}
 
 	operation->flags = op_flags;
diff --git a/drivers/staging/lustre/lnet/lnet/net_fault.c b/drivers/staging/lustre/lnet/lnet/net_fault.c
index 3c83aa31e2c2..5a5d1811ffbe 100644
--- a/drivers/staging/lustre/lnet/lnet/net_fault.c
+++ b/drivers/staging/lustre/lnet/lnet/net_fault.c
@@ -700,9 +700,9 @@ lnet_delay_rule_daemon(void *arg)
 }
 
 static void
-delay_timer_cb(unsigned long arg)
+delay_timer_cb(struct timer_list *t)
 {
-	struct lnet_delay_rule *rule = (struct lnet_delay_rule *)arg;
+	struct lnet_delay_rule *rule = from_timer(rule, t, dl_timer);
 
 	spin_lock_bh(&delay_dd.dd_lock);
 	if (list_empty(&rule->dl_sched_link) && delay_dd.dd_running) {
@@ -762,7 +762,7 @@ lnet_delay_rule_add(struct lnet_fault_attr *attr)
 		wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_running);
 	}
 
-	setup_timer(&rule->dl_timer, delay_timer_cb, (unsigned long)rule);
+	timer_setup(&rule->dl_timer, delay_timer_cb, 0);
 
 	spin_lock_init(&rule->dl_lock);
 	INIT_LIST_HEAD(&rule->dl_msg_list);
diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c
index 23cdb7c4476c..63be6e7273f3 100644
--- a/drivers/staging/lustre/lustre/ptlrpc/service.c
+++ b/drivers/staging/lustre/lustre/ptlrpc/service.c
@@ -329,11 +329,11 @@ ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt)
 	return -1;
 }
 
-static void ptlrpc_at_timer(unsigned long castmeharder)
+static void ptlrpc_at_timer(struct timer_list *t)
 {
 	struct ptlrpc_service_part *svcpt;
 
-	svcpt = (struct ptlrpc_service_part *)castmeharder;
+	svcpt = from_timer(svcpt, t, scp_at_timer);
 
 	svcpt->scp_at_check = 1;
 	svcpt->scp_at_checktime = cfs_time_current();
@@ -506,8 +506,7 @@ ptlrpc_service_part_init(struct ptlrpc_service *svc,
 	if (!array->paa_reqs_count)
 		goto free_reqs_array;
 
-	setup_timer(&svcpt->scp_at_timer, ptlrpc_at_timer,
-		    (unsigned long)svcpt);
+	timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer, 0);
 
 	/* At SOW, service time should be quick; 10s seems generous. If client
 	 * timeout is less than this, we'll be sending an early reply.
@@ -926,7 +925,7 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt)
 	next = (__s32)(array->paa_deadline - ktime_get_real_seconds() -
 		       at_early_margin);
 	if (next <= 0) {
-		ptlrpc_at_timer((unsigned long)svcpt);
+		ptlrpc_at_timer(&svcpt->scp_at_timer);
 	} else {
 		mod_timer(&svcpt->scp_at_timer, cfs_time_shift(next));
 		CDEBUG(D_INFO, "armed %s at %+ds\n",
diff --git a/drivers/staging/media/imx/imx-ic-prpencvf.c b/drivers/staging/media/imx/imx-ic-prpencvf.c
index 0790b3d9e255..143038c6c403 100644
--- a/drivers/staging/media/imx/imx-ic-prpencvf.c
+++ b/drivers/staging/media/imx/imx-ic-prpencvf.c
@@ -293,9 +293,9 @@ static irqreturn_t prp_nfb4eof_interrupt(int irq, void *dev_id)
  * EOF timeout timer function. This is an unrecoverable condition
  * without a stream restart.
  */
-static void prp_eof_timeout(unsigned long data)
+static void prp_eof_timeout(struct timer_list *t)
 {
-	struct prp_priv *priv = (struct prp_priv *)data;
+	struct prp_priv *priv = from_timer(priv, t, eof_timeout_timer);
 	struct imx_media_video_dev *vdev = priv->vdev;
 	struct imx_ic_priv *ic_priv = priv->ic_priv;
 
@@ -1292,8 +1292,7 @@ static int prp_init(struct imx_ic_priv *ic_priv)
 	priv->ic_priv = ic_priv;
 
 	spin_lock_init(&priv->irqlock);
-	setup_timer(&priv->eof_timeout_timer, prp_eof_timeout,
-		    (unsigned long)priv);
+	timer_setup(&priv->eof_timeout_timer, prp_eof_timeout, 0);
 
 	priv->vdev = imx_media_capture_device_init(&ic_priv->sd,
 						   PRPENCVF_SRC_PAD);
diff --git a/drivers/staging/media/imx/imx-media-csi.c b/drivers/staging/media/imx/imx-media-csi.c
index 6d856118c223..bb1d6dafca83 100644
--- a/drivers/staging/media/imx/imx-media-csi.c
+++ b/drivers/staging/media/imx/imx-media-csi.c
@@ -254,9 +254,9 @@ static irqreturn_t csi_idmac_nfb4eof_interrupt(int irq, void *dev_id)
  * EOF timeout timer function. This is an unrecoverable condition
  * without a stream restart.
  */
-static void csi_idmac_eof_timeout(unsigned long data)
+static void csi_idmac_eof_timeout(struct timer_list *t)
 {
-	struct csi_priv *priv = (struct csi_priv *)data;
+	struct csi_priv *priv = from_timer(priv, t, eof_timeout_timer);
 	struct imx_media_video_dev *vdev = priv->vdev;
 
 	v4l2_err(&priv->sd, "EOF timeout\n");
@@ -1739,8 +1739,7 @@ static int imx_csi_probe(struct platform_device *pdev)
 	priv->csi_id = pdata->csi;
 	priv->smfc_id = (priv->csi_id == 0) ? 0 : 2;
 
-	setup_timer(&priv->eof_timeout_timer, csi_idmac_eof_timeout,
-		    (unsigned long)priv);
+	timer_setup(&priv->eof_timeout_timer, csi_idmac_eof_timeout, 0);
 	spin_lock_init(&priv->irqlock);
 
 	v4l2_subdev_init(&priv->sd, &csi_subdev_ops);
diff --git a/drivers/staging/most/hdm-usb/hdm_usb.c b/drivers/staging/most/hdm-usb/hdm_usb.c
index 85775da293fb..667dacac81f0 100644
--- a/drivers/staging/most/hdm-usb/hdm_usb.c
+++ b/drivers/staging/most/hdm-usb/hdm_usb.c
@@ -744,9 +744,9 @@ static void hdm_request_netinfo(struct most_interface *iface, int channel,
  * The handler runs in interrupt context. That's why we need to defer the
  * tasks to a work queue.
  */
-static void link_stat_timer_handler(unsigned long data)
+static void link_stat_timer_handler(struct timer_list *t)
 {
-	struct most_dev *mdev = (struct most_dev *)data;
+	struct most_dev *mdev = from_timer(mdev, t, link_stat_timer);
 
 	schedule_work(&mdev->poll_work_obj);
 	mdev->link_stat_timer.expires = jiffies + (2 * HZ);
@@ -1138,8 +1138,7 @@ hdm_probe(struct usb_interface *interface, const struct usb_device_id *id)
 	num_endpoints = usb_iface_desc->desc.bNumEndpoints;
 	mutex_init(&mdev->io_mutex);
 	INIT_WORK(&mdev->poll_work_obj, wq_netinfo);
-	setup_timer(&mdev->link_stat_timer, link_stat_timer_handler,
-		    (unsigned long)mdev);
+	timer_setup(&mdev->link_stat_timer, link_stat_timer_handler, 0);
 
 	mdev->usb_device = usb_dev;
 	mdev->link_stat_timer.expires = jiffies + (2 * HZ);
diff --git a/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c b/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
index 4e7908322d77..f56fdc7a4b61 100644
--- a/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
+++ b/drivers/staging/rtl8192u/ieee80211/ieee80211_softmac.c
@@ -391,10 +391,10 @@ static void ieee80211_send_beacon(struct ieee80211_device *ieee)
 }
 
 
-static void ieee80211_send_beacon_cb(unsigned long _ieee)
+static void ieee80211_send_beacon_cb(struct timer_list *t)
 {
 	struct ieee80211_device *ieee =
-		(struct ieee80211_device *) _ieee;
+		from_timer(ieee, t, beacon_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ieee->beacon_lock, flags);
@@ -1251,9 +1251,11 @@ void ieee80211_associate_abort(struct ieee80211_device *ieee)
 	spin_unlock_irqrestore(&ieee->lock, flags);
 }
 
-static void ieee80211_associate_abort_cb(unsigned long dev)
+static void ieee80211_associate_abort_cb(struct timer_list *t)
 {
-	ieee80211_associate_abort((struct ieee80211_device *) dev);
+	struct ieee80211_device *dev = from_timer(dev, t, associate_timer);
+
+	ieee80211_associate_abort(dev);
 }
 
 
@@ -2718,11 +2720,9 @@ void ieee80211_softmac_init(struct ieee80211_device *ieee)
 	ieee->enable_rx_imm_BA = true;
 	ieee->tx_pending.txb = NULL;
 
-	setup_timer(&ieee->associate_timer, ieee80211_associate_abort_cb,
-		    (unsigned long)ieee);
+	timer_setup(&ieee->associate_timer, ieee80211_associate_abort_cb, 0);
 
-	setup_timer(&ieee->beacon_timer, ieee80211_send_beacon_cb,
-		    (unsigned long)ieee);
+	timer_setup(&ieee->beacon_timer, ieee80211_send_beacon_cb, 0);
 
 
 	INIT_DELAYED_WORK(&ieee->start_ibss_wq, ieee80211_start_ibss_wq);
diff --git a/drivers/staging/rtl8712/recv_linux.c b/drivers/staging/rtl8712/recv_linux.c
index 576c15d25a0f..986a55bb9877 100644
--- a/drivers/staging/rtl8712/recv_linux.c
+++ b/drivers/staging/rtl8712/recv_linux.c
@@ -138,17 +138,16 @@ _recv_indicatepkt_drop:
 	precvpriv->rx_drop++;
 }
 
-static void _r8712_reordering_ctrl_timeout_handler (unsigned long data)
+static void _r8712_reordering_ctrl_timeout_handler (struct timer_list *t)
 {
 	struct recv_reorder_ctrl *preorder_ctrl =
-			 (struct recv_reorder_ctrl *)data;
+			 from_timer(preorder_ctrl, t, reordering_ctrl_timer);
 
 	r8712_reordering_ctrl_timeout_handler(preorder_ctrl);
 }
 
 void r8712_init_recv_timer(struct recv_reorder_ctrl *preorder_ctrl)
 {
-	setup_timer(&preorder_ctrl->reordering_ctrl_timer,
-		     _r8712_reordering_ctrl_timeout_handler,
-		     (unsigned long)preorder_ctrl);
+	timer_setup(&preorder_ctrl->reordering_ctrl_timer,
+		    _r8712_reordering_ctrl_timeout_handler, 0);
 }
diff --git a/drivers/staging/rtl8712/rtl8712_led.c b/drivers/staging/rtl8712/rtl8712_led.c
index da1d4a641dcd..455fba721135 100644
--- a/drivers/staging/rtl8712/rtl8712_led.c
+++ b/drivers/staging/rtl8712/rtl8712_led.c
@@ -74,7 +74,7 @@ enum _LED_STATE_871x {
  *	Prototype of protected function.
  *===========================================================================
  */
-static void BlinkTimerCallback(unsigned long data);
+static void BlinkTimerCallback(struct timer_list *t);
 
 static void BlinkWorkItemCallback(struct work_struct *work);
 /*===========================================================================
@@ -99,8 +99,7 @@ static void InitLed871x(struct _adapter *padapter, struct LED_871x *pLed,
 	pLed->bLedBlinkInProgress = false;
 	pLed->BlinkTimes = 0;
 	pLed->BlinkingLedState = LED_UNKNOWN;
-	setup_timer(&pLed->BlinkTimer, BlinkTimerCallback,
-		    (unsigned long)pLed);
+	timer_setup(&pLed->BlinkTimer, BlinkTimerCallback, 0);
 	INIT_WORK(&pLed->BlinkWorkItem, BlinkWorkItemCallback);
 }
 
@@ -825,9 +824,9 @@ static void SwLedBlink6(struct LED_871x *pLed)
  *		Callback function of LED BlinkTimer,
  *		it just schedules to corresponding BlinkWorkItem.
  */
-static void BlinkTimerCallback(unsigned long data)
+static void BlinkTimerCallback(struct timer_list *t)
 {
-	struct LED_871x  *pLed = (struct LED_871x *)data;
+	struct LED_871x  *pLed = from_timer(pLed, t, BlinkTimer);
 
 	/* This fixed the crash problem on Fedora 12 when trying to do the
 	 * insmod;ifconfig up;rmmod commands.
diff --git a/drivers/staging/unisys/visorbus/visorbus_main.c b/drivers/staging/unisys/visorbus/visorbus_main.c
index b604d0cccef1..6cb6eb0673c6 100644
--- a/drivers/staging/unisys/visorbus/visorbus_main.c
+++ b/drivers/staging/unisys/visorbus/visorbus_main.c
@@ -493,9 +493,9 @@ static const struct file_operations bus_info_debugfs_fops = {
 	.release = single_release,
 };
 
-static void dev_periodic_work(unsigned long __opaque)
+static void dev_periodic_work(struct timer_list *t)
 {
-	struct visor_device *dev = (struct visor_device *)__opaque;
+	struct visor_device *dev = from_timer(dev, t, timer);
 	struct visor_driver *drv = to_visor_driver(dev->device.driver);
 
 	drv->channel_interrupt(dev);
@@ -667,7 +667,7 @@ int create_visor_device(struct visor_device *dev)
 	dev->device.release = visorbus_release_device;
 	/* keep a reference just for us (now 2) */
 	get_device(&dev->device);
-	setup_timer(&dev->timer, dev_periodic_work, (unsigned long)dev);
+	timer_setup(&dev->timer, dev_periodic_work, 0);
 	/*
 	 * bus_id must be a unique name with respect to this bus TYPE (NOT bus
 	 * instance).  That's why we need to include the bus number within the
diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c
index 735d7e5fa86b..6d8239163ba5 100644
--- a/drivers/staging/unisys/visornic/visornic_main.c
+++ b/drivers/staging/unisys/visornic/visornic_main.c
@@ -1766,9 +1766,10 @@ static int visornic_poll(struct napi_struct *napi, int budget)
  * Main function of the vnic_incoming thread. Periodically check the response
  * queue and drain it if needed.
  */
-static void poll_for_irq(unsigned long v)
+static void poll_for_irq(struct timer_list *t)
 {
-	struct visornic_devdata *devdata = (struct visornic_devdata *)v;
+	struct visornic_devdata *devdata = from_timer(devdata, t,
+						      irq_poll_timer);
 
 	if (!visorchannel_signalempty(
 				   devdata->dev->visorchannel,
@@ -1899,8 +1900,7 @@ static int visornic_probe(struct visor_device *dev)
 	/* Let's start our threads to get responses */
 	netif_napi_add(netdev, &devdata->napi, visornic_poll, NAPI_WEIGHT);
 
-	setup_timer(&devdata->irq_poll_timer, poll_for_irq,
-		    (unsigned long)devdata);
+	timer_setup(&devdata->irq_poll_timer, poll_for_irq, 0);
 	/* Note: This time has to start running before the while
 	 * loop below because the napi routine is responsible for
 	 * setting enab_dis_acked
diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
index 8a275996d4e6..028da1dc1b81 100644
--- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
+++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
@@ -267,7 +267,7 @@ static void update_scan_time(void)
 		last_scanned_shadow[i].time_scan = jiffies;
 }
 
-static void remove_network_from_shadow(unsigned long unused)
+static void remove_network_from_shadow(struct timer_list *unused)
 {
 	unsigned long now = jiffies;
 	int i, j;
@@ -292,7 +292,7 @@ static void remove_network_from_shadow(unsigned long unused)
 	}
 }
 
-static void clear_duringIP(unsigned long arg)
+static void clear_duringIP(struct timer_list *unused)
 {
 	wilc_optaining_ip = false;
 }
@@ -2278,8 +2278,8 @@ int wilc_init_host_int(struct net_device *net)
 
 	priv = wdev_priv(net->ieee80211_ptr);
 	if (op_ifcs == 0) {
-		setup_timer(&hAgingTimer, remove_network_from_shadow, 0);
-		setup_timer(&wilc_during_ip_timer, clear_duringIP, 0);
+		timer_setup(&hAgingTimer, remove_network_from_shadow, 0);
+		timer_setup(&wilc_during_ip_timer, clear_duringIP, 0);
 	}
 	op_ifcs++;
 
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 9469695f5871..a8eaed2c211a 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -1044,9 +1044,9 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data)
 	return 0;
 }
 
-static void tcmu_device_timedout(unsigned long data)
+static void tcmu_device_timedout(struct timer_list *t)
 {
-	struct tcmu_dev *udev = (struct tcmu_dev *)data;
+	struct tcmu_dev *udev = from_timer(udev, t, timeout);
 	unsigned long flags;
 
 	spin_lock_irqsave(&udev->commands_lock, flags);
@@ -1106,8 +1106,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	idr_init(&udev->commands);
 	spin_lock_init(&udev->commands_lock);
 
-	setup_timer(&udev->timeout, tcmu_device_timedout,
-		(unsigned long)udev);
+	timer_setup(&udev->timeout, tcmu_device_timedout, 0);
 
 	init_waitqueue_head(&udev->nl_cmd_wq);
 	spin_lock_init(&udev->nl_cmd_lock);
diff --git a/drivers/tty/ipwireless/hardware.c b/drivers/tty/ipwireless/hardware.c
index a6b8240af6cd..b0baa4ce10f9 100644
--- a/drivers/tty/ipwireless/hardware.c
+++ b/drivers/tty/ipwireless/hardware.c
@@ -33,7 +33,7 @@ static void handle_received_SETUP_packet(struct ipw_hardware *ipw,
 					 unsigned int address,
 					 const unsigned char *data, int len,
 					 int is_last);
-static void ipwireless_setup_timer(unsigned long data);
+static void ipwireless_setup_timer(struct timer_list *t);
 static void handle_received_CTRL_packet(struct ipw_hardware *hw,
 		unsigned int channel_idx, const unsigned char *data, int len);
 
@@ -1635,8 +1635,7 @@ struct ipw_hardware *ipwireless_hardware_create(void)
 	spin_lock_init(&hw->lock);
 	tasklet_init(&hw->tasklet, ipwireless_do_tasklet, (unsigned long) hw);
 	INIT_WORK(&hw->work_rx, ipw_receive_data_work);
-	setup_timer(&hw->setup_timer, ipwireless_setup_timer,
-			(unsigned long) hw);
+	timer_setup(&hw->setup_timer, ipwireless_setup_timer, 0);
 
 	return hw;
 }
@@ -1670,12 +1669,12 @@ void ipwireless_init_hardware_v2_v3(struct ipw_hardware *hw)
 	hw->init_loops = 0;
 	printk(KERN_INFO IPWIRELESS_PCCARD_NAME
 	       ": waiting for card to start up...\n");
-	ipwireless_setup_timer((unsigned long) hw);
+	ipwireless_setup_timer(&hw->setup_timer);
 }
 
-static void ipwireless_setup_timer(unsigned long data)
+static void ipwireless_setup_timer(struct timer_list *t)
 {
-	struct ipw_hardware *hw = (struct ipw_hardware *) data;
+	struct ipw_hardware *hw = from_timer(hw, t, setup_timer);
 
 	hw->init_loops++;
 
diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c
index 3a39eb685c69..5131bdc9e765 100644
--- a/drivers/tty/n_gsm.c
+++ b/drivers/tty/n_gsm.c
@@ -1310,9 +1310,9 @@ static void gsm_control_transmit(struct gsm_mux *gsm, struct gsm_control *ctrl)
  *	gsm->pending_cmd will be NULL and we just let the timer expire.
  */
 
-static void gsm_control_retransmit(unsigned long data)
+static void gsm_control_retransmit(struct timer_list *t)
 {
-	struct gsm_mux *gsm = (struct gsm_mux *)data;
+	struct gsm_mux *gsm = from_timer(gsm, t, t2_timer);
 	struct gsm_control *ctrl;
 	unsigned long flags;
 	spin_lock_irqsave(&gsm->control_lock, flags);
@@ -1453,9 +1453,9 @@ static void gsm_dlci_open(struct gsm_dlci *dlci)
  *	end will get a DM response)
  */
 
-static void gsm_dlci_t1(unsigned long data)
+static void gsm_dlci_t1(struct timer_list *t)
 {
-	struct gsm_dlci *dlci = (struct gsm_dlci *)data;
+	struct gsm_dlci *dlci = from_timer(dlci, t, t1);
 	struct gsm_mux *gsm = dlci->gsm;
 
 	switch (dlci->state) {
@@ -1634,7 +1634,7 @@ static struct gsm_dlci *gsm_dlci_alloc(struct gsm_mux *gsm, int addr)
 	}
 
 	skb_queue_head_init(&dlci->skb_list);
-	setup_timer(&dlci->t1, gsm_dlci_t1, (unsigned long)dlci);
+	timer_setup(&dlci->t1, gsm_dlci_t1, 0);
 	tty_port_init(&dlci->port);
 	dlci->port.ops = &gsm_port_ops;
 	dlci->gsm = gsm;
@@ -2088,7 +2088,7 @@ static int gsm_activate_mux(struct gsm_mux *gsm)
 	struct gsm_dlci *dlci;
 	int i = 0;
 
-	setup_timer(&gsm->t2_timer, gsm_control_retransmit, (unsigned long)gsm);
+	timer_setup(&gsm->t2_timer, gsm_control_retransmit, 0);
 	init_waitqueue_head(&gsm->event);
 	spin_lock_init(&gsm->control_lock);
 	spin_lock_init(&gsm->tx_lock);
diff --git a/drivers/tty/n_r3964.c b/drivers/tty/n_r3964.c
index 9f246d4db3ca..30bb0900cd2f 100644
--- a/drivers/tty/n_r3964.c
+++ b/drivers/tty/n_r3964.c
@@ -115,7 +115,7 @@ static void retry_transmit(struct r3964_info *pInfo);
 static void transmit_block(struct r3964_info *pInfo);
 static void receive_char(struct r3964_info *pInfo, const unsigned char c);
 static void receive_error(struct r3964_info *pInfo, const char flag);
-static void on_timeout(unsigned long priv);
+static void on_timeout(struct timer_list *t);
 static int enable_signals(struct r3964_info *pInfo, struct pid *pid, int arg);
 static int read_telegram(struct r3964_info *pInfo, struct pid *pid,
 		unsigned char __user * buf);
@@ -688,9 +688,9 @@ static void receive_error(struct r3964_info *pInfo, const char flag)
 	}
 }
 
-static void on_timeout(unsigned long priv)
+static void on_timeout(struct timer_list *t)
 {
-	struct r3964_info *pInfo = (void *)priv;
+	struct r3964_info *pInfo = from_timer(pInfo, t, tmr);
 
 	switch (pInfo->state) {
 	case R3964_TX_REQUEST:
@@ -993,7 +993,7 @@ static int r3964_open(struct tty_struct *tty)
 	tty->disc_data = pInfo;
 	tty->receive_room = 65536;
 
-	setup_timer(&pInfo->tmr, on_timeout, (unsigned long)pInfo);
+	timer_setup(&pInfo->tmr, on_timeout, 0);
 
 	return 0;
 }
diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c
index 1421804975e0..c9458a033e3c 100644
--- a/drivers/tty/serial/crisv10.c
+++ b/drivers/tty/serial/crisv10.c
@@ -2059,7 +2059,7 @@ static void flush_timeout_function(unsigned long data)
 static struct timer_list flush_timer;
 
 static void
-timed_flush_handler(unsigned long ptr)
+timed_flush_handler(struct timer_list *unused)
 {
 	struct e100_serial *info;
 	int i;
@@ -4137,7 +4137,7 @@ static int __init rs_init(void)
 	/* Setup the timed flush handler system */
 
 #if !defined(CONFIG_ETRAX_SERIAL_FAST_TIMER)
-	setup_timer(&flush_timer, timed_flush_handler, 0);
+	timer_setup(&flush_timer, timed_flush_handler, 0);
 	mod_timer(&flush_timer, jiffies + 5);
 #endif
 
diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c
index c84e6f0db54e..1c4d3f387138 100644
--- a/drivers/tty/serial/fsl_lpuart.c
+++ b/drivers/tty/serial/fsl_lpuart.c
@@ -966,9 +966,9 @@ static void lpuart_dma_rx_complete(void *arg)
 	lpuart_copy_rx_to_tty(sport);
 }
 
-static void lpuart_timer_func(unsigned long data)
+static void lpuart_timer_func(struct timer_list *t)
 {
-	struct lpuart_port *sport = (struct lpuart_port *)data;
+	struct lpuart_port *sport = from_timer(sport, t, lpuart_timer);
 
 	lpuart_copy_rx_to_tty(sport);
 }
@@ -1263,8 +1263,7 @@ static void lpuart32_setup_watermark(struct lpuart_port *sport)
 
 static void rx_dma_timer_init(struct lpuart_port *sport)
 {
-		setup_timer(&sport->lpuart_timer, lpuart_timer_func,
-				(unsigned long)sport);
+		timer_setup(&sport->lpuart_timer, lpuart_timer_func, 0);
 		sport->lpuart_timer.expires = jiffies + sport->dma_rx_timeout;
 		add_timer(&sport->lpuart_timer);
 }
diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c
index 473f4f81d690..ffefd218761e 100644
--- a/drivers/tty/serial/ifx6x60.c
+++ b/drivers/tty/serial/ifx6x60.c
@@ -263,9 +263,9 @@ static void mrdy_assert(struct ifx_spi_device *ifx_dev)
  *	The SPI has timed out: hang up the tty. Users will then see a hangup
  *	and error events.
  */
-static void ifx_spi_timeout(unsigned long arg)
+static void ifx_spi_timeout(struct timer_list *t)
 {
-	struct ifx_spi_device *ifx_dev = (struct ifx_spi_device *)arg;
+	struct ifx_spi_device *ifx_dev = from_timer(ifx_dev, t, spi_timer);
 
 	dev_warn(&ifx_dev->spi_dev->dev, "*** SPI Timeout ***");
 	tty_port_tty_hangup(&ifx_dev->tty_port, false);
@@ -1016,8 +1016,7 @@ static int ifx_spi_spi_probe(struct spi_device *spi)
 	spin_lock_init(&ifx_dev->write_lock);
 	spin_lock_init(&ifx_dev->power_lock);
 	ifx_dev->power_status = 0;
-	setup_timer(&ifx_dev->spi_timer, ifx_spi_timeout,
-		    (unsigned long)ifx_dev);
+	timer_setup(&ifx_dev->spi_timer, ifx_spi_timeout, 0);
 	ifx_dev->modem = pl_data->modem_type;
 	ifx_dev->use_dma = pl_data->use_dma;
 	ifx_dev->max_hz = pl_data->max_hz;
diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c
index a67a606c38eb..e4b3d9123a03 100644
--- a/drivers/tty/serial/imx.c
+++ b/drivers/tty/serial/imx.c
@@ -906,9 +906,9 @@ static void imx_break_ctl(struct uart_port *port, int break_state)
  * This is our per-port timeout handler, for checking the
  * modem status signals.
  */
-static void imx_timeout(unsigned long data)
+static void imx_timeout(struct timer_list *t)
 {
-	struct imx_port *sport = (struct imx_port *)data;
+	struct imx_port *sport = from_timer(sport, t, timer);
 	unsigned long flags;
 
 	if (sport->port.state) {
@@ -2082,7 +2082,7 @@ static int serial_imx_probe(struct platform_device *pdev)
 	sport->port.rs485_config = imx_rs485_config;
 	sport->port.rs485.flags |= SER_RS485_RTS_ON_SEND;
 	sport->port.flags = UPF_BOOT_AUTOCONF;
-	setup_timer(&sport->timer, imx_timeout, (unsigned long)sport);
+	timer_setup(&sport->timer, imx_timeout, 0);
 
 	sport->gpios = mctrl_gpio_init(&sport->port, 0);
 	if (IS_ERR(sport->gpios))
diff --git a/drivers/tty/serial/kgdb_nmi.c b/drivers/tty/serial/kgdb_nmi.c
index ed2b03058627..4029272891f9 100644
--- a/drivers/tty/serial/kgdb_nmi.c
+++ b/drivers/tty/serial/kgdb_nmi.c
@@ -188,9 +188,9 @@ bool kgdb_nmi_poll_knock(void)
  * The tasklet is cheap, it does not cause wakeups when reschedules itself,
  * instead it waits for the next tick.
  */
-static void kgdb_nmi_tty_receiver(unsigned long data)
+static void kgdb_nmi_tty_receiver(struct timer_list *t)
 {
-	struct kgdb_nmi_tty_priv *priv = (void *)data;
+	struct kgdb_nmi_tty_priv *priv = from_timer(priv, t, timer);
 	char ch;
 
 	priv->timer.expires = jiffies + (HZ/100);
@@ -241,7 +241,7 @@ static int kgdb_nmi_tty_install(struct tty_driver *drv, struct tty_struct *tty)
 		return -ENOMEM;
 
 	INIT_KFIFO(priv->fifo);
-	setup_timer(&priv->timer, kgdb_nmi_tty_receiver, (unsigned long)priv);
+	timer_setup(&priv->timer, kgdb_nmi_tty_receiver, 0);
 	tty_port_init(&priv->port);
 	priv->port.ops = &kgdb_nmi_tty_port_ops;
 	tty->driver_data = priv;
diff --git a/drivers/tty/serial/max3100.c b/drivers/tty/serial/max3100.c
index 27d6049eb6a9..371569a0fd00 100644
--- a/drivers/tty/serial/max3100.c
+++ b/drivers/tty/serial/max3100.c
@@ -178,9 +178,9 @@ static void max3100_dowork(struct max3100_port *s)
 		queue_work(s->workqueue, &s->work);
 }
 
-static void max3100_timeout(unsigned long data)
+static void max3100_timeout(struct timer_list *t)
 {
-	struct max3100_port *s = (struct max3100_port *)data;
+	struct max3100_port *s = from_timer(s, t, timer);
 
 	if (s->port.state) {
 		max3100_dowork(s);
@@ -780,8 +780,7 @@ static int max3100_probe(struct spi_device *spi)
 		max3100s[i]->poll_time = 1;
 	max3100s[i]->max3100_hw_suspend = pdata->max3100_hw_suspend;
 	max3100s[i]->minor = i;
-	setup_timer(&max3100s[i]->timer, max3100_timeout,
-		    (unsigned long)max3100s[i]);
+	timer_setup(&max3100s[i]->timer, max3100_timeout, 0);
 
 	dev_dbg(&spi->dev, "%s: adding port %d\n", __func__, i);
 	max3100s[i]->port.irq = max3100s[i]->irq;
diff --git a/drivers/tty/serial/mux.c b/drivers/tty/serial/mux.c
index 3b74369c262f..00ce31e8d19a 100644
--- a/drivers/tty/serial/mux.c
+++ b/drivers/tty/serial/mux.c
@@ -371,7 +371,7 @@ static int mux_verify_port(struct uart_port *port, struct serial_struct *ser)
  *
  * This function periodically polls the Serial MUX to check for new data.
  */
-static void mux_poll(unsigned long unused)
+static void mux_poll(struct timer_list *unused)
 {  
 	int i;
 
@@ -572,7 +572,7 @@ static int __init mux_init(void)
 
 	if(port_cnt > 0) {
 		/* Start the Mux timer */
-		setup_timer(&mux_timer, mux_poll, 0UL);
+		timer_setup(&mux_timer, mux_poll, 0);
 		mod_timer(&mux_timer, jiffies + MUX_POLL_DELAY);
 
 #ifdef CONFIG_SERIAL_MUX_CONSOLE
diff --git a/drivers/tty/serial/pnx8xxx_uart.c b/drivers/tty/serial/pnx8xxx_uart.c
index f8812389b8a8..223a9499104e 100644
--- a/drivers/tty/serial/pnx8xxx_uart.c
+++ b/drivers/tty/serial/pnx8xxx_uart.c
@@ -103,9 +103,9 @@ static void pnx8xxx_mctrl_check(struct pnx8xxx_port *sport)
  * This is our per-port timeout handler, for checking the
  * modem status signals.
  */
-static void pnx8xxx_timeout(unsigned long data)
+static void pnx8xxx_timeout(struct timer_list *t)
 {
-	struct pnx8xxx_port *sport = (struct pnx8xxx_port *)data;
+	struct pnx8xxx_port *sport = from_timer(sport, t, timer);
 	unsigned long flags;
 
 	if (sport->port.state) {
@@ -662,8 +662,7 @@ static void __init pnx8xxx_init_ports(void)
 	first = 0;
 
 	for (i = 0; i < NR_PORTS; i++) {
-		setup_timer(&pnx8xxx_ports[i].timer, pnx8xxx_timeout,
-			    (unsigned long)&pnx8xxx_ports[i]);
+		timer_setup(&pnx8xxx_ports[i].timer, pnx8xxx_timeout, 0);
 		pnx8xxx_ports[i].port.ops = &pnx8xxx_pops;
 	}
 }
diff --git a/drivers/tty/serial/sa1100.c b/drivers/tty/serial/sa1100.c
index 4e3f169b30cf..a399772be3fc 100644
--- a/drivers/tty/serial/sa1100.c
+++ b/drivers/tty/serial/sa1100.c
@@ -110,9 +110,9 @@ static void sa1100_mctrl_check(struct sa1100_port *sport)
  * This is our per-port timeout handler, for checking the
  * modem status signals.
  */
-static void sa1100_timeout(unsigned long data)
+static void sa1100_timeout(struct timer_list *t)
 {
-	struct sa1100_port *sport = (struct sa1100_port *)data;
+	struct sa1100_port *sport = from_timer(sport, t, timer);
 	unsigned long flags;
 
 	if (sport->port.state) {
@@ -627,8 +627,7 @@ static void __init sa1100_init_ports(void)
 		sa1100_ports[i].port.fifosize  = 8;
 		sa1100_ports[i].port.line      = i;
 		sa1100_ports[i].port.iotype    = UPIO_MEM;
-		setup_timer(&sa1100_ports[i].timer, sa1100_timeout,
-			    (unsigned long)&sa1100_ports[i]);
+		timer_setup(&sa1100_ports[i].timer, sa1100_timeout, 0);
 	}
 
 	/*
diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c
index 31fcc7072a90..d9f399c4e90c 100644
--- a/drivers/tty/serial/sh-sci.c
+++ b/drivers/tty/serial/sh-sci.c
@@ -1058,9 +1058,9 @@ static int scif_rtrg_enabled(struct uart_port *port)
 			(SCFCR_RTRG0 | SCFCR_RTRG1)) != 0;
 }
 
-static void rx_fifo_timer_fn(unsigned long arg)
+static void rx_fifo_timer_fn(struct timer_list *t)
 {
-	struct sci_port *s = (struct sci_port *)arg;
+	struct sci_port *s = from_timer(s, t, rx_fifo_timer);
 	struct uart_port *port = &s->port;
 
 	dev_dbg(port->dev, "Rx timed out\n");
@@ -1138,8 +1138,7 @@ static ssize_t rx_fifo_timeout_store(struct device *dev,
 		sci->rx_fifo_timeout = r;
 		scif_set_rtrg(port, 1);
 		if (r > 0)
-			setup_timer(&sci->rx_fifo_timer, rx_fifo_timer_fn,
-				    (unsigned long)sci);
+			timer_setup(&sci->rx_fifo_timer, rx_fifo_timer_fn, 0);
 	}
 
 	return count;
@@ -1392,9 +1391,9 @@ static void work_fn_tx(struct work_struct *work)
 	dma_async_issue_pending(chan);
 }
 
-static void rx_timer_fn(unsigned long arg)
+static void rx_timer_fn(struct timer_list *t)
 {
-	struct sci_port *s = (struct sci_port *)arg;
+	struct sci_port *s = from_timer(s, t, rx_timer);
 	struct dma_chan *chan = s->chan_rx;
 	struct uart_port *port = &s->port;
 	struct dma_tx_state state;
@@ -1572,7 +1571,7 @@ static void sci_request_dma(struct uart_port *port)
 			dma += s->buf_len_rx;
 		}
 
-		setup_timer(&s->rx_timer, rx_timer_fn, (unsigned long)s);
+		timer_setup(&s->rx_timer, rx_timer_fn, 0);
 
 		if (port->type == PORT_SCIFA || port->type == PORT_SCIFB)
 			sci_submit_rx(s);
@@ -2238,8 +2237,7 @@ static void sci_reset(struct uart_port *port)
 	if (s->rx_trigger > 1) {
 		if (s->rx_fifo_timeout) {
 			scif_set_rtrg(port, 1);
-			setup_timer(&s->rx_fifo_timer, rx_fifo_timer_fn,
-				    (unsigned long)s);
+			timer_setup(&s->rx_fifo_timer, rx_fifo_timer_fn, 0);
 		} else {
 			if (port->type == PORT_SCIFA ||
 			    port->type == PORT_SCIFB)
diff --git a/drivers/tty/serial/sn_console.c b/drivers/tty/serial/sn_console.c
index ed78542c4c37..42b9aded4eb1 100644
--- a/drivers/tty/serial/sn_console.c
+++ b/drivers/tty/serial/sn_console.c
@@ -612,9 +612,9 @@ static irqreturn_t sn_sal_interrupt(int irq, void *dev_id)
  * Obviously not used in interrupt mode
  *
  */
-static void sn_sal_timer_poll(unsigned long data)
+static void sn_sal_timer_poll(struct timer_list *t)
 {
-	struct sn_cons_port *port = (struct sn_cons_port *)data;
+	struct sn_cons_port *port = from_timer(port, t, sc_timer);
 	unsigned long flags;
 
 	if (!port)
@@ -668,7 +668,7 @@ static void __init sn_sal_switch_to_asynch(struct sn_cons_port *port)
 	 * timer to poll for input and push data from the console
 	 * buffer.
 	 */
-	setup_timer(&port->sc_timer, sn_sal_timer_poll, (unsigned long)port);
+	timer_setup(&port->sc_timer, sn_sal_timer_poll, 0);
 
 	if (IS_RUNNING_ON_SIMULATOR())
 		port->sc_interrupt_timeout = 6;
diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c
index f2c34d656144..3c4ad71f261d 100644
--- a/drivers/tty/synclink.c
+++ b/drivers/tty/synclink.c
@@ -700,7 +700,7 @@ static void usc_enable_async_clock( struct mgsl_struct *info, u32 DataRate );
 
 static void usc_loopback_frame( struct mgsl_struct *info );
 
-static void mgsl_tx_timeout(unsigned long context);
+static void mgsl_tx_timeout(struct timer_list *t);
 
 
 static void usc_loopmode_cancel_transmit( struct mgsl_struct * info );
@@ -1768,7 +1768,7 @@ static int startup(struct mgsl_struct * info)
 	
 	memset(&info->icount, 0, sizeof(info->icount));
 
-	setup_timer(&info->tx_timer, mgsl_tx_timeout, (unsigned long)info);
+	timer_setup(&info->tx_timer, mgsl_tx_timeout, 0);
 	
 	/* Allocate and claim adapter resources */
 	retval = mgsl_claim_resources(info);
@@ -7517,9 +7517,9 @@ static void mgsl_trace_block(struct mgsl_struct *info,const char* data, int coun
  * Arguments:	context		pointer to device instance data
  * Return Value:	None
  */
-static void mgsl_tx_timeout(unsigned long context)
+static void mgsl_tx_timeout(struct timer_list *t)
 {
-	struct mgsl_struct *info = (struct mgsl_struct*)context;
+	struct mgsl_struct *info = from_timer(info, t, tx_timer);
 	unsigned long flags;
 	
 	if ( debug_level >= DEBUG_LEVEL_INFO )
diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c
index 06a03731bba7..255c49687877 100644
--- a/drivers/tty/synclink_gt.c
+++ b/drivers/tty/synclink_gt.c
@@ -493,8 +493,8 @@ static void free_bufs(struct slgt_info *info, struct slgt_desc *bufs, int count)
 static int  alloc_tmp_rbuf(struct slgt_info *info);
 static void free_tmp_rbuf(struct slgt_info *info);
 
-static void tx_timeout(unsigned long context);
-static void rx_timeout(unsigned long context);
+static void tx_timeout(struct timer_list *t);
+static void rx_timeout(struct timer_list *t);
 
 /*
  * ioctl handlers
@@ -3597,8 +3597,8 @@ static struct slgt_info *alloc_dev(int adapter_num, int port_num, struct pci_dev
 		info->adapter_num = adapter_num;
 		info->port_num = port_num;
 
-		setup_timer(&info->tx_timer, tx_timeout, (unsigned long)info);
-		setup_timer(&info->rx_timer, rx_timeout, (unsigned long)info);
+		timer_setup(&info->tx_timer, tx_timeout, 0);
+		timer_setup(&info->rx_timer, rx_timeout, 0);
 
 		/* Copy configuration info to device instance data */
 		info->pdev = pdev;
@@ -5112,9 +5112,9 @@ static int adapter_test(struct slgt_info *info)
 /*
  * transmit timeout handler
  */
-static void tx_timeout(unsigned long context)
+static void tx_timeout(struct timer_list *t)
 {
-	struct slgt_info *info = (struct slgt_info*)context;
+	struct slgt_info *info = from_timer(info, t, tx_timer);
 	unsigned long flags;
 
 	DBGINFO(("%s tx_timeout\n", info->device_name));
@@ -5136,9 +5136,9 @@ static void tx_timeout(unsigned long context)
 /*
  * receive buffer polling timer
  */
-static void rx_timeout(unsigned long context)
+static void rx_timeout(struct timer_list *t)
 {
-	struct slgt_info *info = (struct slgt_info*)context;
+	struct slgt_info *info = from_timer(info, t, rx_timer);
 	unsigned long flags;
 
 	DBGINFO(("%s rx_timeout\n", info->device_name));
diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c
index d45f234e1914..75f11ce1f0a1 100644
--- a/drivers/tty/synclinkmp.c
+++ b/drivers/tty/synclinkmp.c
@@ -615,8 +615,8 @@ static void free_tmp_rx_buf(SLMP_INFO *info);
 
 static void load_pci_memory(SLMP_INFO *info, char* dest, const char* src, unsigned short count);
 static void trace_block(SLMP_INFO *info, const char* data, int count, int xmit);
-static void tx_timeout(unsigned long context);
-static void status_timeout(unsigned long context);
+static void tx_timeout(struct timer_list *t);
+static void status_timeout(struct timer_list *t);
 
 static unsigned char read_reg(SLMP_INFO *info, unsigned char addr);
 static void write_reg(SLMP_INFO *info, unsigned char addr, unsigned char val);
@@ -3782,9 +3782,8 @@ static SLMP_INFO *alloc_dev(int adapter_num, int port_num, struct pci_dev *pdev)
 		info->bus_type = MGSL_BUS_TYPE_PCI;
 		info->irq_flags = IRQF_SHARED;
 
-		setup_timer(&info->tx_timer, tx_timeout, (unsigned long)info);
-		setup_timer(&info->status_timer, status_timeout,
-				(unsigned long)info);
+		timer_setup(&info->tx_timer, tx_timeout, 0);
+		timer_setup(&info->status_timer, status_timeout, 0);
 
 		/* Store the PCI9050 misc control register value because a flaw
 		 * in the PCI9050 prevents LCR registers from being read if
@@ -5468,9 +5467,9 @@ static void trace_block(SLMP_INFO *info,const char* data, int count, int xmit)
 /* called when HDLC frame times out
  * update stats and do tx completion processing
  */
-static void tx_timeout(unsigned long context)
+static void tx_timeout(struct timer_list *t)
 {
-	SLMP_INFO *info = (SLMP_INFO*)context;
+	SLMP_INFO *info = from_timer(info, t, tx_timer);
 	unsigned long flags;
 
 	if ( debug_level >= DEBUG_LEVEL_INFO )
@@ -5495,10 +5494,10 @@ static void tx_timeout(unsigned long context)
 
 /* called to periodically check the DSR/RI modem signal input status
  */
-static void status_timeout(unsigned long context)
+static void status_timeout(struct timer_list *t)
 {
 	u16 status = 0;
-	SLMP_INFO *info = (SLMP_INFO*)context;
+	SLMP_INFO *info = from_timer(info, t, status_timer);
 	unsigned long flags;
 	unsigned char delta;
 
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 19b5c4afeef2..fc32391a34d5 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -788,9 +788,11 @@ void usb_hcd_poll_rh_status(struct usb_hcd *hcd)
 EXPORT_SYMBOL_GPL(usb_hcd_poll_rh_status);
 
 /* timer callback */
-static void rh_timer_func (unsigned long _hcd)
+static void rh_timer_func (struct timer_list *t)
 {
-	usb_hcd_poll_rh_status((struct usb_hcd *) _hcd);
+	struct usb_hcd *_hcd = from_timer(_hcd, t, rh_timer);
+
+	usb_hcd_poll_rh_status(_hcd);
 }
 
 /*-------------------------------------------------------------------------*/
@@ -2545,7 +2547,7 @@ struct usb_hcd *__usb_create_hcd(const struct hc_driver *driver,
 	hcd->self.bus_name = bus_name;
 	hcd->self.uses_dma = (sysdev->dma_mask != NULL);
 
-	setup_timer(&hcd->rh_timer, rh_timer_func, (unsigned long)hcd);
+	timer_setup(&hcd->rh_timer, rh_timer_func, 0);
 #ifdef CONFIG_PM
 	INIT_WORK(&hcd->wakeup_work, hcd_resume_work);
 #endif
diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c
index 69eb40cd1b47..7b6eb0ad513b 100644
--- a/drivers/usb/dwc2/hcd.c
+++ b/drivers/usb/dwc2/hcd.c
@@ -3314,9 +3314,9 @@ host:
 	}
 }
 
-static void dwc2_wakeup_detected(unsigned long data)
+static void dwc2_wakeup_detected(struct timer_list *t)
 {
-	struct dwc2_hsotg *hsotg = (struct dwc2_hsotg *)data;
+	struct dwc2_hsotg *hsotg = from_timer(hsotg, t, wkp_timer);
 	u32 hprt0;
 
 	dev_dbg(hsotg->dev, "%s()\n", __func__);
@@ -5155,8 +5155,7 @@ int dwc2_hcd_init(struct dwc2_hsotg *hsotg)
 	}
 	INIT_WORK(&hsotg->wf_otg, dwc2_conn_id_status_change);
 
-	setup_timer(&hsotg->wkp_timer, dwc2_wakeup_detected,
-		    (unsigned long)hsotg);
+	timer_setup(&hsotg->wkp_timer, dwc2_wakeup_detected, 0);
 
 	/* Initialize the non-periodic schedule */
 	INIT_LIST_HEAD(&hsotg->non_periodic_sched_inactive);
diff --git a/drivers/usb/dwc2/hcd_queue.c b/drivers/usb/dwc2/hcd_queue.c
index f472de238ac2..fcd1676c7f0b 100644
--- a/drivers/usb/dwc2/hcd_queue.c
+++ b/drivers/usb/dwc2/hcd_queue.c
@@ -1275,9 +1275,9 @@ static void dwc2_do_unreserve(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh)
  *
  * @work: Pointer to a qh unreserve_work.
  */
-static void dwc2_unreserve_timer_fn(unsigned long data)
+static void dwc2_unreserve_timer_fn(struct timer_list *t)
 {
-	struct dwc2_qh *qh = (struct dwc2_qh *)data;
+	struct dwc2_qh *qh = from_timer(qh, t, unreserve_timer);
 	struct dwc2_hsotg *hsotg = qh->hsotg;
 	unsigned long flags;
 
@@ -1467,8 +1467,7 @@ static void dwc2_qh_init(struct dwc2_hsotg *hsotg, struct dwc2_qh *qh,
 
 	/* Initialize QH */
 	qh->hsotg = hsotg;
-	setup_timer(&qh->unreserve_timer, dwc2_unreserve_timer_fn,
-		    (unsigned long)qh);
+	timer_setup(&qh->unreserve_timer, dwc2_unreserve_timer_fn, 0);
 	qh->ep_type = ep_type;
 	qh->ep_is_in = ep_is_in;
 
diff --git a/drivers/usb/gadget/udc/at91_udc.c b/drivers/usb/gadget/udc/at91_udc.c
index bfe278294e88..ad743a8493be 100644
--- a/drivers/usb/gadget/udc/at91_udc.c
+++ b/drivers/usb/gadget/udc/at91_udc.c
@@ -1550,9 +1550,9 @@ static void at91_vbus_timer_work(struct work_struct *work)
 		mod_timer(&udc->vbus_timer, jiffies + VBUS_POLL_TIMEOUT);
 }
 
-static void at91_vbus_timer(unsigned long data)
+static void at91_vbus_timer(struct timer_list *t)
 {
-	struct at91_udc *udc = (struct at91_udc *)data;
+	struct at91_udc *udc = from_timer(udc, t, vbus_timer);
 
 	/*
 	 * If we are polling vbus it is likely that the gpio is on an
@@ -1918,8 +1918,7 @@ static int at91udc_probe(struct platform_device *pdev)
 
 		if (udc->board.vbus_polled) {
 			INIT_WORK(&udc->vbus_timer_work, at91_vbus_timer_work);
-			setup_timer(&udc->vbus_timer, at91_vbus_timer,
-				    (unsigned long)udc);
+			timer_setup(&udc->vbus_timer, at91_vbus_timer, 0);
 			mod_timer(&udc->vbus_timer,
 				  jiffies + VBUS_POLL_TIMEOUT);
 		} else {
diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c
index 4f1b1809472c..d0128f92ec5a 100644
--- a/drivers/usb/gadget/udc/dummy_hcd.c
+++ b/drivers/usb/gadget/udc/dummy_hcd.c
@@ -1771,9 +1771,9 @@ static int handle_control_request(struct dummy_hcd *dum_hcd, struct urb *urb,
 /* drive both sides of the transfers; looks like irq handlers to
  * both drivers except the callbacks aren't in_irq().
  */
-static void dummy_timer(unsigned long _dum_hcd)
+static void dummy_timer(struct timer_list *t)
 {
-	struct dummy_hcd	*dum_hcd = (struct dummy_hcd *) _dum_hcd;
+	struct dummy_hcd	*dum_hcd = from_timer(dum_hcd, t, timer);
 	struct dummy		*dum = dum_hcd->dum;
 	struct urbp		*urbp, *tmp;
 	unsigned long		flags;
@@ -2445,7 +2445,7 @@ static DEVICE_ATTR_RO(urbs);
 
 static int dummy_start_ss(struct dummy_hcd *dum_hcd)
 {
-	setup_timer(&dum_hcd->timer, dummy_timer, (unsigned long)dum_hcd);
+	timer_setup(&dum_hcd->timer, dummy_timer, 0);
 	dum_hcd->rh_state = DUMMY_RH_RUNNING;
 	dum_hcd->stream_en_ep = 0;
 	INIT_LIST_HEAD(&dum_hcd->urbp_list);
@@ -2474,7 +2474,7 @@ static int dummy_start(struct usb_hcd *hcd)
 		return dummy_start_ss(dum_hcd);
 
 	spin_lock_init(&dum_hcd->dum->lock);
-	setup_timer(&dum_hcd->timer, dummy_timer, (unsigned long)dum_hcd);
+	timer_setup(&dum_hcd->timer, dummy_timer, 0);
 	dum_hcd->rh_state = DUMMY_RH_RUNNING;
 
 	INIT_LIST_HEAD(&dum_hcd->urbp_list);
diff --git a/drivers/usb/gadget/udc/m66592-udc.c b/drivers/usb/gadget/udc/m66592-udc.c
index f19e6282a688..a8288df6aadf 100644
--- a/drivers/usb/gadget/udc/m66592-udc.c
+++ b/drivers/usb/gadget/udc/m66592-udc.c
@@ -1259,9 +1259,9 @@ static irqreturn_t m66592_irq(int irq, void *_m66592)
 	return IRQ_HANDLED;
 }
 
-static void m66592_timer(unsigned long _m66592)
+static void m66592_timer(struct timer_list *t)
 {
-	struct m66592 *m66592 = (struct m66592 *)_m66592;
+	struct m66592 *m66592 = from_timer(m66592, t, timer);
 	unsigned long flags;
 	u16 tmp;
 
@@ -1589,7 +1589,7 @@ static int m66592_probe(struct platform_device *pdev)
 	m66592->gadget.max_speed = USB_SPEED_HIGH;
 	m66592->gadget.name = udc_name;
 
-	setup_timer(&m66592->timer, m66592_timer, (unsigned long)m66592);
+	timer_setup(&m66592->timer, m66592_timer, 0);
 	m66592->reg = reg;
 
 	ret = request_irq(ires->start, m66592_irq, IRQF_SHARED,
diff --git a/drivers/usb/gadget/udc/omap_udc.c b/drivers/usb/gadget/udc/omap_udc.c
index fb8c4bff584c..dc35a54bad90 100644
--- a/drivers/usb/gadget/udc/omap_udc.c
+++ b/drivers/usb/gadget/udc/omap_udc.c
@@ -1854,9 +1854,9 @@ static irqreturn_t omap_udc_irq(int irq, void *_udc)
 #define PIO_OUT_TIMEOUT	(jiffies + HZ/3)
 #define HALF_FULL(f)	(!((f)&(UDC_NON_ISO_FIFO_FULL|UDC_NON_ISO_FIFO_EMPTY)))
 
-static void pio_out_timer(unsigned long _ep)
+static void pio_out_timer(struct timer_list *t)
 {
-	struct omap_ep	*ep = (void *) _ep;
+	struct omap_ep	*ep = from_timer(ep, t, timer);
 	unsigned long	flags;
 	u16		stat_flg;
 
@@ -2542,7 +2542,7 @@ omap_ep_setup(char *name, u8 addr, u8 type,
 		}
 		if (dbuf && addr)
 			epn_rxtx |= UDC_EPN_RX_DB;
-		setup_timer(&ep->timer, pio_out_timer, (unsigned long)ep);
+		timer_setup(&ep->timer, pio_out_timer, 0);
 	}
 	if (addr)
 		epn_rxtx |= UDC_EPN_RX_VALID;
diff --git a/drivers/usb/gadget/udc/pxa25x_udc.c b/drivers/usb/gadget/udc/pxa25x_udc.c
index 8f135d9fa245..0e3f5faa000e 100644
--- a/drivers/usb/gadget/udc/pxa25x_udc.c
+++ b/drivers/usb/gadget/udc/pxa25x_udc.c
@@ -1624,9 +1624,9 @@ static inline void clear_ep_state (struct pxa25x_udc *dev)
 		nuke(&dev->ep[i], -ECONNABORTED);
 }
 
-static void udc_watchdog(unsigned long _dev)
+static void udc_watchdog(struct timer_list *t)
 {
-	struct pxa25x_udc	*dev = (void *)_dev;
+	struct pxa25x_udc	*dev = from_timer(dev, t, timer);
 
 	local_irq_disable();
 	if (dev->ep0state == EP0_STALL
@@ -2413,7 +2413,7 @@ static int pxa25x_udc_probe(struct platform_device *pdev)
 		gpio_direction_output(dev->mach->gpio_pullup, 0);
 	}
 
-	setup_timer(&dev->timer, udc_watchdog, (unsigned long)dev);
+	timer_setup(&dev->timer, udc_watchdog, 0);
 
 	the_controller = dev;
 	platform_set_drvdata(pdev, dev);
diff --git a/drivers/usb/gadget/udc/r8a66597-udc.c b/drivers/usb/gadget/udc/r8a66597-udc.c
index 143122ed3c66..a3ecce62662b 100644
--- a/drivers/usb/gadget/udc/r8a66597-udc.c
+++ b/drivers/usb/gadget/udc/r8a66597-udc.c
@@ -1514,9 +1514,9 @@ static irqreturn_t r8a66597_irq(int irq, void *_r8a66597)
 	return IRQ_HANDLED;
 }
 
-static void r8a66597_timer(unsigned long _r8a66597)
+static void r8a66597_timer(struct timer_list *t)
 {
-	struct r8a66597 *r8a66597 = (struct r8a66597 *)_r8a66597;
+	struct r8a66597 *r8a66597 = from_timer(r8a66597, t, timer);
 	unsigned long flags;
 	u16 tmp;
 
@@ -1874,7 +1874,7 @@ static int r8a66597_probe(struct platform_device *pdev)
 	r8a66597->gadget.max_speed = USB_SPEED_HIGH;
 	r8a66597->gadget.name = udc_name;
 
-	setup_timer(&r8a66597->timer, r8a66597_timer, (unsigned long)r8a66597);
+	timer_setup(&r8a66597->timer, r8a66597_timer, 0);
 	r8a66597->reg = reg;
 
 	if (r8a66597->pdata->on_chip) {
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index 10887e09e9bc..ee9676349333 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -80,7 +80,7 @@ static const char	hcd_name [] = "ohci_hcd";
 
 static void ohci_dump(struct ohci_hcd *ohci);
 static void ohci_stop(struct usb_hcd *hcd);
-static void io_watchdog_func(unsigned long _ohci);
+static void io_watchdog_func(struct timer_list *t);
 
 #include "ohci-hub.c"
 #include "ohci-dbg.c"
@@ -500,8 +500,7 @@ static int ohci_init (struct ohci_hcd *ohci)
 	if (ohci->hcca)
 		return 0;
 
-	setup_timer(&ohci->io_watchdog, io_watchdog_func,
-			(unsigned long) ohci);
+	timer_setup(&ohci->io_watchdog, io_watchdog_func, 0);
 
 	ohci->hcca = dma_alloc_coherent (hcd->self.controller,
 			sizeof(*ohci->hcca), &ohci->hcca_dma, GFP_KERNEL);
@@ -723,9 +722,9 @@ static int ohci_start(struct usb_hcd *hcd)
  * the unlink list.  As a result, URBs could never be dequeued and
  * endpoints could never be released.
  */
-static void io_watchdog_func(unsigned long _ohci)
+static void io_watchdog_func(struct timer_list *t)
 {
-	struct ohci_hcd	*ohci = (struct ohci_hcd *) _ohci;
+	struct ohci_hcd	*ohci = from_timer(ohci, t, io_watchdog);
 	bool		takeback_all_pending = false;
 	u32		status;
 	u32		head;
diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c
index 0bf7759aae78..c5e6e8d0b5ef 100644
--- a/drivers/usb/host/oxu210hp-hcd.c
+++ b/drivers/usb/host/oxu210hp-hcd.c
@@ -2539,9 +2539,9 @@ static irqreturn_t oxu_irq(struct usb_hcd *hcd)
 	return ret;
 }
 
-static void oxu_watchdog(unsigned long param)
+static void oxu_watchdog(struct timer_list *t)
 {
-	struct oxu_hcd	*oxu = (struct oxu_hcd *) param;
+	struct oxu_hcd	*oxu = from_timer(oxu, t, watchdog);
 	unsigned long flags;
 
 	spin_lock_irqsave(&oxu->lock, flags);
@@ -2577,7 +2577,7 @@ static int oxu_hcd_init(struct usb_hcd *hcd)
 
 	spin_lock_init(&oxu->lock);
 
-	setup_timer(&oxu->watchdog, oxu_watchdog, (unsigned long)oxu);
+	timer_setup(&oxu->watchdog, oxu_watchdog, 0);
 
 	/*
 	 * hw default: 1K periodic list heads, one per frame.
diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c
index f3d9ba420a97..984892dd72f5 100644
--- a/drivers/usb/host/r8a66597-hcd.c
+++ b/drivers/usb/host/r8a66597-hcd.c
@@ -1798,9 +1798,9 @@ static void r8a66597_td_timer(struct timer_list *t)
 	spin_unlock_irqrestore(&r8a66597->lock, flags);
 }
 
-static void r8a66597_timer(unsigned long _r8a66597)
+static void r8a66597_timer(struct timer_list *t)
 {
-	struct r8a66597 *r8a66597 = (struct r8a66597 *)_r8a66597;
+	struct r8a66597 *r8a66597 = from_timer(r8a66597, t, rh_timer);
 	unsigned long flags;
 	int port;
 
@@ -2472,8 +2472,7 @@ static int r8a66597_probe(struct platform_device *pdev)
 		r8a66597->max_root_hub = 2;
 
 	spin_lock_init(&r8a66597->lock);
-	setup_timer(&r8a66597->rh_timer, r8a66597_timer,
-		    (unsigned long)r8a66597);
+	timer_setup(&r8a66597->rh_timer, r8a66597_timer, 0);
 	r8a66597->reg = reg;
 
 	/* make sure no interrupts are pending */
diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c
index 601fb00603cc..fa88a903fa2e 100644
--- a/drivers/usb/host/sl811-hcd.c
+++ b/drivers/usb/host/sl811-hcd.c
@@ -1119,9 +1119,9 @@ sl811h_hub_descriptor (
 }
 
 static void
-sl811h_timer(unsigned long _sl811)
+sl811h_timer(struct timer_list *t)
 {
-	struct sl811 	*sl811 = (void *) _sl811;
+	struct sl811 	*sl811 = from_timer(sl811, t, timer);
 	unsigned long	flags;
 	u8		irqstat;
 	u8		signaling = sl811->ctrl1 & SL11H_CTL1MASK_FORCE;
@@ -1692,7 +1692,7 @@ sl811h_probe(struct platform_device *dev)
 	spin_lock_init(&sl811->lock);
 	INIT_LIST_HEAD(&sl811->async);
 	sl811->board = dev_get_platdata(&dev->dev);
-	setup_timer(&sl811->timer, sl811h_timer, (unsigned long)sl811);
+	timer_setup(&sl811->timer, sl811h_timer, 0);
 	sl811->addr_reg = addr_reg;
 	sl811->data_reg = data_reg;
 
diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c
index babeefd84ffd..f5c90217777a 100644
--- a/drivers/usb/host/uhci-hcd.c
+++ b/drivers/usb/host/uhci-hcd.c
@@ -585,8 +585,7 @@ static int uhci_start(struct usb_hcd *hcd)
 		hcd->self.sg_tablesize = ~0;
 
 	spin_lock_init(&uhci->lock);
-	setup_timer(&uhci->fsbr_timer, uhci_fsbr_timeout,
-			(unsigned long) uhci);
+	timer_setup(&uhci->fsbr_timer, uhci_fsbr_timeout, 0);
 	INIT_LIST_HEAD(&uhci->idle_qh_list);
 	init_waitqueue_head(&uhci->waitqh);
 
diff --git a/drivers/usb/host/uhci-q.c b/drivers/usb/host/uhci-q.c
index 49d4edc03cc2..d40438238938 100644
--- a/drivers/usb/host/uhci-q.c
+++ b/drivers/usb/host/uhci-q.c
@@ -90,9 +90,9 @@ static void uhci_urbp_wants_fsbr(struct uhci_hcd *uhci, struct urb_priv *urbp)
 	}
 }
 
-static void uhci_fsbr_timeout(unsigned long _uhci)
+static void uhci_fsbr_timeout(struct timer_list *t)
 {
-	struct uhci_hcd *uhci = (struct uhci_hcd *) _uhci;
+	struct uhci_hcd *uhci = from_timer(uhci, t, fsbr_timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&uhci->lock, flags);
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index 327ba8b8a98b..2424d3020ca3 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -395,14 +395,14 @@ static inline void xhci_msix_sync_irqs(struct xhci_hcd *xhci)
 
 #endif
 
-static void compliance_mode_recovery(unsigned long arg)
+static void compliance_mode_recovery(struct timer_list *t)
 {
 	struct xhci_hcd *xhci;
 	struct usb_hcd *hcd;
 	u32 temp;
 	int i;
 
-	xhci = (struct xhci_hcd *)arg;
+	xhci = from_timer(xhci, t, comp_mode_recovery_timer);
 
 	for (i = 0; i < xhci->num_usb3_ports; i++) {
 		temp = readl(xhci->usb3_ports[i]);
@@ -443,8 +443,8 @@ static void compliance_mode_recovery(unsigned long arg)
 static void compliance_mode_recovery_timer_init(struct xhci_hcd *xhci)
 {
 	xhci->port_status_u0 = 0;
-	setup_timer(&xhci->comp_mode_recovery_timer,
-		    compliance_mode_recovery, (unsigned long)xhci);
+	timer_setup(&xhci->comp_mode_recovery_timer, compliance_mode_recovery,
+		    0);
 	xhci->comp_mode_recovery_timer.expires = jiffies +
 			msecs_to_jiffies(COMP_MODE_RCVRY_MSECS);
 
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index a859c2d33c29..fdceb46d9fc6 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -555,9 +555,9 @@ static void mos7840_set_led_sync(struct usb_serial_port *port, __u16 reg,
 			val, reg, NULL, 0, MOS_WDR_TIMEOUT);
 }
 
-static void mos7840_led_off(unsigned long arg)
+static void mos7840_led_off(struct timer_list *t)
 {
-	struct moschip_port *mcs = (struct moschip_port *) arg;
+	struct moschip_port *mcs = from_timer(mcs, t, led_timer1);
 
 	/* Turn off LED */
 	mos7840_set_led_async(mcs, 0x0300, MODEM_CONTROL_REGISTER);
@@ -565,9 +565,9 @@ static void mos7840_led_off(unsigned long arg)
 				jiffies + msecs_to_jiffies(LED_OFF_MS));
 }
 
-static void mos7840_led_flag_off(unsigned long arg)
+static void mos7840_led_flag_off(struct timer_list *t)
 {
-	struct moschip_port *mcs = (struct moschip_port *) arg;
+	struct moschip_port *mcs = from_timer(mcs, t, led_timer2);
 
 	clear_bit_unlock(MOS7840_FLAG_LED_BUSY, &mcs->flags);
 }
@@ -2289,12 +2289,11 @@ static int mos7840_port_probe(struct usb_serial_port *port)
 			goto error;
 		}
 
-		setup_timer(&mos7840_port->led_timer1, mos7840_led_off,
-			    (unsigned long)mos7840_port);
+		timer_setup(&mos7840_port->led_timer1, mos7840_led_off, 0);
 		mos7840_port->led_timer1.expires =
 			jiffies + msecs_to_jiffies(LED_ON_MS);
-		setup_timer(&mos7840_port->led_timer2, mos7840_led_flag_off,
-			    (unsigned long)mos7840_port);
+		timer_setup(&mos7840_port->led_timer2, mos7840_led_flag_off,
+			    0);
 		mos7840_port->led_timer2.expires =
 			jiffies + msecs_to_jiffies(LED_OFF_MS);
 
diff --git a/drivers/usb/storage/realtek_cr.c b/drivers/usb/storage/realtek_cr.c
index 48e2e32c97e8..31b024441938 100644
--- a/drivers/usb/storage/realtek_cr.c
+++ b/drivers/usb/storage/realtek_cr.c
@@ -751,9 +751,9 @@ static void rts51x_modi_suspend_timer(struct rts51x_chip *chip)
 	mod_timer(&chip->rts51x_suspend_timer, chip->timer_expires);
 }
 
-static void rts51x_suspend_timer_fn(unsigned long data)
+static void rts51x_suspend_timer_fn(struct timer_list *t)
 {
-	struct rts51x_chip *chip = (struct rts51x_chip *)data;
+	struct rts51x_chip *chip = from_timer(chip, t, rts51x_suspend_timer);
 	struct us_data *us = chip->us;
 
 	switch (rts51x_get_stat(chip)) {
@@ -917,8 +917,7 @@ static int realtek_cr_autosuspend_setup(struct us_data *us)
 	us->proto_handler = rts51x_invoke_transport;
 
 	chip->timer_expires = 0;
-	setup_timer(&chip->rts51x_suspend_timer, rts51x_suspend_timer_fn,
-			(unsigned long)chip);
+	timer_setup(&chip->rts51x_suspend_timer, rts51x_suspend_timer_fn, 0);
 	fw5895_init(us);
 
 	/* enable autosuspend function of the usb device */
diff --git a/drivers/uwb/drp.c b/drivers/uwb/drp.c
index 38d0504a1bbc..625f706b8160 100644
--- a/drivers/uwb/drp.c
+++ b/drivers/uwb/drp.c
@@ -603,9 +603,9 @@ static void uwb_cnflt_update_work(struct work_struct *work)
 	mutex_unlock(&rc->rsvs_mutex);
 }
 
-static void uwb_cnflt_timer(unsigned long arg)
+static void uwb_cnflt_timer(struct timer_list *t)
 {
-	struct uwb_cnflt_alien *cnflt = (struct uwb_cnflt_alien *)arg;
+	struct uwb_cnflt_alien *cnflt = from_timer(cnflt, t, timer);
 
 	queue_work(cnflt->rc->rsv_workq, &cnflt->cnflt_update_work);
 }
@@ -642,7 +642,7 @@ static void uwb_drp_handle_alien_drp(struct uwb_rc *rc, struct uwb_ie_drp *drp_i
 	}
 
 	INIT_LIST_HEAD(&cnflt->rc_node);
-	setup_timer(&cnflt->timer, uwb_cnflt_timer, (unsigned long)cnflt);
+	timer_setup(&cnflt->timer, uwb_cnflt_timer, 0);
 
 	cnflt->rc = rc;
 	INIT_WORK(&cnflt->cnflt_update_work, uwb_cnflt_update_work);
diff --git a/drivers/uwb/neh.c b/drivers/uwb/neh.c
index 36b5cb62c15d..fbdca728bd9f 100644
--- a/drivers/uwb/neh.c
+++ b/drivers/uwb/neh.c
@@ -115,7 +115,7 @@ struct uwb_rc_neh {
 	struct list_head list_node;
 };
 
-static void uwb_rc_neh_timer(unsigned long arg);
+static void uwb_rc_neh_timer(struct timer_list *t);
 
 static void uwb_rc_neh_release(struct kref *kref)
 {
@@ -223,7 +223,7 @@ struct uwb_rc_neh *uwb_rc_neh_add(struct uwb_rc *rc, struct uwb_rccb *cmd,
 
 	kref_init(&neh->kref);
 	INIT_LIST_HEAD(&neh->list_node);
-	setup_timer(&neh->timer, uwb_rc_neh_timer, (unsigned long)neh);
+	timer_setup(&neh->timer, uwb_rc_neh_timer, 0);
 
 	neh->rc = rc;
 	neh->evt_type = expected_type;
@@ -565,9 +565,9 @@ void uwb_rc_neh_error(struct uwb_rc *rc, int error)
 EXPORT_SYMBOL_GPL(uwb_rc_neh_error);
 
 
-static void uwb_rc_neh_timer(unsigned long arg)
+static void uwb_rc_neh_timer(struct timer_list *t)
 {
-	struct uwb_rc_neh *neh = (struct uwb_rc_neh *)arg;
+	struct uwb_rc_neh *neh = from_timer(neh, t, timer);
 	struct uwb_rc *rc = neh->rc;
 	unsigned long flags;
 
diff --git a/drivers/uwb/rsv.c b/drivers/uwb/rsv.c
index f5e27247a38f..fe25a8cc6fa1 100644
--- a/drivers/uwb/rsv.c
+++ b/drivers/uwb/rsv.c
@@ -23,7 +23,7 @@
 
 #include "uwb-internal.h"
 
-static void uwb_rsv_timer(unsigned long arg);
+static void uwb_rsv_timer(struct timer_list *t);
 
 static const char *rsv_states[] = {
 	[UWB_RSV_STATE_NONE]                 = "none            ",
@@ -198,9 +198,9 @@ static void uwb_rsv_put_stream(struct uwb_rsv *rsv)
 	dev_dbg(dev, "put stream %d\n", rsv->stream);
 }
 
-void uwb_rsv_backoff_win_timer(unsigned long arg)
+void uwb_rsv_backoff_win_timer(struct timer_list *t)
 {
-	struct uwb_drp_backoff_win *bow = (struct uwb_drp_backoff_win *)arg;
+	struct uwb_drp_backoff_win *bow = from_timer(bow, t, timer);
 	struct uwb_rc *rc = container_of(bow, struct uwb_rc, bow);
 	struct device *dev = &rc->uwb_dev.dev;
 
@@ -470,7 +470,7 @@ static struct uwb_rsv *uwb_rsv_alloc(struct uwb_rc *rc)
 	INIT_LIST_HEAD(&rsv->rc_node);
 	INIT_LIST_HEAD(&rsv->pal_node);
 	kref_init(&rsv->kref);
-	setup_timer(&rsv->timer, uwb_rsv_timer, (unsigned long)rsv);
+	timer_setup(&rsv->timer, uwb_rsv_timer, 0);
 
 	rsv->rc = rc;
 	INIT_WORK(&rsv->handle_timeout_work, uwb_rsv_handle_timeout_work);
@@ -939,9 +939,9 @@ static void uwb_rsv_alien_bp_work(struct work_struct *work)
 	mutex_unlock(&rc->rsvs_mutex);
 }
 
-static void uwb_rsv_timer(unsigned long arg)
+static void uwb_rsv_timer(struct timer_list *t)
 {
-	struct uwb_rsv *rsv = (struct uwb_rsv *)arg;
+	struct uwb_rsv *rsv = from_timer(rsv, t, timer);
 
 	queue_work(rsv->rc->rsv_workq, &rsv->handle_timeout_work);
 }
@@ -987,8 +987,7 @@ void uwb_rsv_init(struct uwb_rc *rc)
 	rc->bow.can_reserve_extra_mases = true;
 	rc->bow.total_expired = 0;
 	rc->bow.window = UWB_DRP_BACKOFF_WIN_MIN >> 1;
-	setup_timer(&rc->bow.timer, uwb_rsv_backoff_win_timer,
-			(unsigned long)&rc->bow);
+	timer_setup(&rc->bow.timer, uwb_rsv_backoff_win_timer, 0);
 
 	bitmap_complement(rc->uwb_dev.streams, rc->uwb_dev.streams, UWB_NUM_STREAMS);
 }
diff --git a/drivers/uwb/uwb-internal.h b/drivers/uwb/uwb-internal.h
index 353c0555a1f5..91326ce093a7 100644
--- a/drivers/uwb/uwb-internal.h
+++ b/drivers/uwb/uwb-internal.h
@@ -329,7 +329,7 @@ void uwb_rsv_put(struct uwb_rsv *rsv);
 bool uwb_rsv_has_two_drp_ies(struct uwb_rsv *rsv);
 void uwb_rsv_dump(char *text, struct uwb_rsv *rsv);
 int uwb_rsv_try_move(struct uwb_rsv *rsv, struct uwb_mas_bm *available);
-void uwb_rsv_backoff_win_timer(unsigned long arg);
+void uwb_rsv_backoff_win_timer(struct timer_list *t);
 void uwb_rsv_backoff_win_increment(struct uwb_rc *rc);
 int uwb_rsv_status(struct uwb_rsv *rsv);
 int uwb_rsv_companion_status(struct uwb_rsv *rsv);
diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index 7e6acaf3ece4..88c05d0448b2 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -120,9 +120,9 @@ static inline void at91_wdt_reset(struct at91wdt *wdt)
 /*
  * Timer tick
  */
-static void at91_ping(unsigned long data)
+static void at91_ping(struct timer_list *t)
 {
-	struct at91wdt *wdt = (struct at91wdt *)data;
+	struct at91wdt *wdt = from_timer(wdt, t, timer);
 	if (time_before(jiffies, wdt->next_heartbeat) ||
 	    !watchdog_active(&wdt->wdd)) {
 		at91_wdt_reset(wdt);
@@ -222,7 +222,7 @@ static int at91_wdt_init(struct platform_device *pdev, struct at91wdt *wdt)
 			 "watchdog already configured differently (mr = %x expecting %x)\n",
 			 tmp & wdt->mr_mask, wdt->mr & wdt->mr_mask);
 
-	setup_timer(&wdt->timer, at91_ping, (unsigned long)wdt);
+	timer_setup(&wdt->timer, at91_ping, 0);
 
 	/*
 	 * Use min_heartbeat the first time to avoid spurious watchdog reset:
diff --git a/drivers/watchdog/bcm47xx_wdt.c b/drivers/watchdog/bcm47xx_wdt.c
index 236582809336..f41b756d6dd5 100644
--- a/drivers/watchdog/bcm47xx_wdt.c
+++ b/drivers/watchdog/bcm47xx_wdt.c
@@ -106,9 +106,9 @@ static const struct watchdog_ops bcm47xx_wdt_hard_ops = {
 	.restart        = bcm47xx_wdt_restart,
 };
 
-static void bcm47xx_wdt_soft_timer_tick(unsigned long data)
+static void bcm47xx_wdt_soft_timer_tick(struct timer_list *t)
 {
-	struct bcm47xx_wdt *wdt = (struct bcm47xx_wdt *)data;
+	struct bcm47xx_wdt *wdt = from_timer(wdt, t, soft_timer);
 	u32 next_tick = min(wdt->wdd.timeout * 1000, wdt->max_timer_ms);
 
 	if (!atomic_dec_and_test(&wdt->soft_ticks)) {
@@ -133,7 +133,7 @@ static int bcm47xx_wdt_soft_start(struct watchdog_device *wdd)
 	struct bcm47xx_wdt *wdt = bcm47xx_wdt_get(wdd);
 
 	bcm47xx_wdt_soft_keepalive(wdd);
-	bcm47xx_wdt_soft_timer_tick((unsigned long)wdt);
+	bcm47xx_wdt_soft_timer_tick(&wdt->soft_timer);
 
 	return 0;
 }
@@ -190,8 +190,7 @@ static int bcm47xx_wdt_probe(struct platform_device *pdev)
 
 	if (soft) {
 		wdt->wdd.ops = &bcm47xx_wdt_soft_ops;
-		setup_timer(&wdt->soft_timer, bcm47xx_wdt_soft_timer_tick,
-			    (long unsigned int)wdt);
+		timer_setup(&wdt->soft_timer, bcm47xx_wdt_soft_timer_tick, 0);
 	} else {
 		wdt->wdd.ops = &bcm47xx_wdt_hard_ops;
 	}
diff --git a/drivers/watchdog/bcm63xx_wdt.c b/drivers/watchdog/bcm63xx_wdt.c
index ab26fd90729e..8555afc70f9b 100644
--- a/drivers/watchdog/bcm63xx_wdt.c
+++ b/drivers/watchdog/bcm63xx_wdt.c
@@ -77,7 +77,7 @@ static void bcm63xx_wdt_isr(void *data)
 	die(PFX " fire", regs);
 }
 
-static void bcm63xx_timer_tick(unsigned long unused)
+static void bcm63xx_timer_tick(struct timer_list *unused)
 {
 	if (!atomic_dec_and_test(&bcm63xx_wdt_device.ticks)) {
 		bcm63xx_wdt_hw_start();
@@ -240,7 +240,7 @@ static int bcm63xx_wdt_probe(struct platform_device *pdev)
 	int ret;
 	struct resource *r;
 
-	setup_timer(&bcm63xx_wdt_device.timer, bcm63xx_timer_tick, 0L);
+	timer_setup(&bcm63xx_wdt_device.timer, bcm63xx_timer_tick, 0);
 
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!r) {
diff --git a/drivers/watchdog/cpu5wdt.c b/drivers/watchdog/cpu5wdt.c
index 6c3f78e45c26..6cfb102c397c 100644
--- a/drivers/watchdog/cpu5wdt.c
+++ b/drivers/watchdog/cpu5wdt.c
@@ -69,7 +69,7 @@ static struct {
 
 /* generic helper functions */
 
-static void cpu5wdt_trigger(unsigned long unused)
+static void cpu5wdt_trigger(struct timer_list *unused)
 {
 	if (verbose > 2)
 		pr_debug("trigger at %i ticks\n", ticks);
@@ -224,7 +224,7 @@ static int cpu5wdt_init(void)
 
 	init_completion(&cpu5wdt_device.stop);
 	cpu5wdt_device.queue = 0;
-	setup_timer(&cpu5wdt_device.timer, cpu5wdt_trigger, 0);
+	timer_setup(&cpu5wdt_device.timer, cpu5wdt_trigger, 0);
 	cpu5wdt_device.default_ticks = ticks;
 
 	if (!request_region(port, CPU5WDT_EXTENT, PFX)) {
diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
index 366e5c7e650b..6610e9217dbc 100644
--- a/drivers/watchdog/mpc8xxx_wdt.c
+++ b/drivers/watchdog/mpc8xxx_wdt.c
@@ -80,9 +80,9 @@ static void mpc8xxx_wdt_keepalive(struct mpc8xxx_wdt_ddata *ddata)
 	spin_unlock(&ddata->lock);
 }
 
-static void mpc8xxx_wdt_timer_ping(unsigned long arg)
+static void mpc8xxx_wdt_timer_ping(struct timer_list *t)
 {
-	struct mpc8xxx_wdt_ddata *ddata = (void *)arg;
+	struct mpc8xxx_wdt_ddata *ddata = from_timer(ddata, t, timer);
 
 	mpc8xxx_wdt_keepalive(ddata);
 	/* We're pinging it twice faster than needed, just to be sure. */
@@ -173,8 +173,7 @@ static int mpc8xxx_wdt_probe(struct platform_device *ofdev)
 	}
 
 	spin_lock_init(&ddata->lock);
-	setup_timer(&ddata->timer, mpc8xxx_wdt_timer_ping,
-		    (unsigned long)ddata);
+	timer_setup(&ddata->timer, mpc8xxx_wdt_timer_ping, 0);
 
 	ddata->wdd.info = &mpc8xxx_wdt_info,
 	ddata->wdd.ops = &mpc8xxx_wdt_ops,
diff --git a/drivers/watchdog/mtx-1_wdt.c b/drivers/watchdog/mtx-1_wdt.c
index ff27c4ac96e4..ca360d204548 100644
--- a/drivers/watchdog/mtx-1_wdt.c
+++ b/drivers/watchdog/mtx-1_wdt.c
@@ -68,7 +68,7 @@ static struct {
 	unsigned int gstate;
 } mtx1_wdt_device;
 
-static void mtx1_wdt_trigger(unsigned long unused)
+static void mtx1_wdt_trigger(struct timer_list *unused)
 {
 	spin_lock(&mtx1_wdt_device.lock);
 	if (mtx1_wdt_device.running)
@@ -219,7 +219,7 @@ static int mtx1_wdt_probe(struct platform_device *pdev)
 	init_completion(&mtx1_wdt_device.stop);
 	mtx1_wdt_device.queue = 0;
 	clear_bit(0, &mtx1_wdt_device.inuse);
-	setup_timer(&mtx1_wdt_device.timer, mtx1_wdt_trigger, 0L);
+	timer_setup(&mtx1_wdt_device.timer, mtx1_wdt_trigger, 0);
 	mtx1_wdt_device.default_ticks = ticks;
 
 	ret = misc_register(&mtx1_wdt_misc);
diff --git a/drivers/watchdog/nuc900_wdt.c b/drivers/watchdog/nuc900_wdt.c
index d5bed78c4d9f..830bd04ff911 100644
--- a/drivers/watchdog/nuc900_wdt.c
+++ b/drivers/watchdog/nuc900_wdt.c
@@ -216,7 +216,7 @@ static ssize_t nuc900_wdt_write(struct file *file, const char __user *data,
 	return len;
 }
 
-static void nuc900_wdt_timer_ping(unsigned long data)
+static void nuc900_wdt_timer_ping(struct timer_list *unused)
 {
 	if (time_before(jiffies, nuc900_wdt->next_heartbeat)) {
 		nuc900_wdt_keepalive();
@@ -267,7 +267,7 @@ static int nuc900wdt_probe(struct platform_device *pdev)
 
 	clk_enable(nuc900_wdt->wdt_clock);
 
-	setup_timer(&nuc900_wdt->timer, nuc900_wdt_timer_ping, 0);
+	timer_setup(&nuc900_wdt->timer, nuc900_wdt_timer_ping, 0);
 
 	ret = misc_register(&nuc900wdt_miscdev);
 	if (ret) {
diff --git a/drivers/watchdog/pcwd.c b/drivers/watchdog/pcwd.c
index 3ad5206d7935..b72ce68eacd3 100644
--- a/drivers/watchdog/pcwd.c
+++ b/drivers/watchdog/pcwd.c
@@ -367,7 +367,7 @@ static void pcwd_show_card_info(void)
 		pr_info("No previous trip detected - Cold boot or reset\n");
 }
 
-static void pcwd_timer_ping(unsigned long data)
+static void pcwd_timer_ping(struct timer_list *unused)
 {
 	int wdrst_stat;
 
@@ -893,7 +893,7 @@ static int pcwd_isa_probe(struct device *dev, unsigned int id)
 	/* clear the "card caused reboot" flag */
 	pcwd_clear_status();
 
-	setup_timer(&pcwd_private.timer, pcwd_timer_ping, 0);
+	timer_setup(&pcwd_private.timer, pcwd_timer_ping, 0);
 
 	/*  Disable the board  */
 	pcwd_stop();
diff --git a/drivers/watchdog/pika_wdt.c b/drivers/watchdog/pika_wdt.c
index e35cf5e87907..e0a6f8c0f03c 100644
--- a/drivers/watchdog/pika_wdt.c
+++ b/drivers/watchdog/pika_wdt.c
@@ -85,7 +85,7 @@ static inline void pikawdt_reset(void)
 /*
  * Timer tick
  */
-static void pikawdt_ping(unsigned long data)
+static void pikawdt_ping(struct timer_list *unused)
 {
 	if (time_before(jiffies, pikawdt_private.next_heartbeat) ||
 			(!nowayout && !pikawdt_private.open)) {
@@ -269,7 +269,7 @@ static int __init pikawdt_init(void)
 
 	iounmap(fpga);
 
-	setup_timer(&pikawdt_private.timer, pikawdt_ping, 0);
+	timer_setup(&pikawdt_private.timer, pikawdt_ping, 0);
 
 	ret = misc_register(&pikawdt_miscdev);
 	if (ret) {
diff --git a/drivers/watchdog/rdc321x_wdt.c b/drivers/watchdog/rdc321x_wdt.c
index 47a8f1b1087d..a281aa84bfb1 100644
--- a/drivers/watchdog/rdc321x_wdt.c
+++ b/drivers/watchdog/rdc321x_wdt.c
@@ -67,7 +67,7 @@ static struct {
 
 /* generic helper functions */
 
-static void rdc321x_wdt_trigger(unsigned long unused)
+static void rdc321x_wdt_trigger(struct timer_list *unused)
 {
 	unsigned long flags;
 	u32 val;
@@ -262,7 +262,7 @@ static int rdc321x_wdt_probe(struct platform_device *pdev)
 
 	clear_bit(0, &rdc321x_wdt_device.inuse);
 
-	setup_timer(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0);
+	timer_setup(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0);
 
 	rdc321x_wdt_device.default_ticks = ticks;
 
diff --git a/drivers/watchdog/shwdt.c b/drivers/watchdog/shwdt.c
index 517a733175ef..a7d6425db807 100644
--- a/drivers/watchdog/shwdt.c
+++ b/drivers/watchdog/shwdt.c
@@ -175,9 +175,9 @@ static int sh_wdt_set_heartbeat(struct watchdog_device *wdt_dev, unsigned t)
 	return 0;
 }
 
-static void sh_wdt_ping(unsigned long data)
+static void sh_wdt_ping(struct timer_list *t)
 {
-	struct sh_wdt *wdt = (struct sh_wdt *)data;
+	struct sh_wdt *wdt = from_timer(wdt, t, timer);
 	unsigned long flags;
 
 	spin_lock_irqsave(&wdt->lock, flags);
@@ -275,7 +275,7 @@ static int sh_wdt_probe(struct platform_device *pdev)
 		return rc;
 	}
 
-	setup_timer(&wdt->timer, sh_wdt_ping, (unsigned long)wdt);
+	timer_setup(&wdt->timer, sh_wdt_ping, 0);
 	wdt->timer.expires	= next_ping_period(clock_division_ratio);
 
 	dev_info(&pdev->dev, "initialized.\n");
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 8d779227370a..bebe59feca58 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -140,7 +140,7 @@ static void o2net_rx_until_empty(struct work_struct *work);
 static void o2net_shutdown_sc(struct work_struct *work);
 static void o2net_listen_data_ready(struct sock *sk);
 static void o2net_sc_send_keep_req(struct work_struct *work);
-static void o2net_idle_timer(unsigned long data);
+static void o2net_idle_timer(struct timer_list *t);
 static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
 static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
 
@@ -450,8 +450,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 	INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
 	INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
 
-	setup_timer(&sc->sc_idle_timeout, o2net_idle_timer,
-		    (unsigned long)sc);
+	timer_setup(&sc->sc_idle_timeout, o2net_idle_timer, 0);
 
 	sclog(sc, "alloced\n");
 
@@ -1517,9 +1516,9 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
 /* socket shutdown does a del_timer_sync against this as it tears down.
  * we can't start this timer until we've got to the point in sc buildup
  * where shutdown is going to be involved */
-static void o2net_idle_timer(unsigned long data)
+static void o2net_idle_timer(struct timer_list *t)
 {
-	struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
+	struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout);
 	struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
 #ifdef CONFIG_DEBUG_FS
 	unsigned long msecs = ktime_to_ms(ktime_get()) -
diff --git a/kernel/padata.c b/kernel/padata.c
index f262c9a4e70a..57c0074d50cc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -288,9 +288,9 @@ static void invoke_padata_reorder(struct work_struct *work)
 	local_bh_enable();
 }
 
-static void padata_reorder_timer(unsigned long arg)
+static void padata_reorder_timer(struct timer_list *t)
 {
-	struct parallel_data *pd = (struct parallel_data *)arg;
+	struct parallel_data *pd = from_timer(pd, t, timer);
 	unsigned int weight;
 	int target_cpu, cpu;
 
@@ -485,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 
 	padata_init_pqueues(pd);
 	padata_init_squeues(pd);
-	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
+	timer_setup(&pd->timer, padata_reorder_timer, 0);
 	atomic_set(&pd->seq_nr, -1);
 	atomic_set(&pd->reorder_objects, 0);
 	atomic_set(&pd->refcnt, 0);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5b51d5ba2a85..65f9e3f24dde 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -171,7 +171,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-static void clocksource_watchdog(unsigned long data)
+static void clocksource_watchdog(struct timer_list *unused)
 {
 	struct clocksource *cs;
 	u64 csnow, wdnow, cslast, wdlast, delta;
@@ -290,7 +290,7 @@ static inline void clocksource_start_watchdog(void)
 {
 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
 		return;
-	setup_timer(&watchdog_timer, clocksource_watchdog, 0UL);
+	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
 	watchdog_running = 1;
diff --git a/net/802/garp.c b/net/802/garp.c
index 2dac647ff420..7f50d47470bd 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -401,9 +401,9 @@ static void garp_join_timer_arm(struct garp_applicant *app)
 	mod_timer(&app->join_timer, jiffies + delay);
 }
 
-static void garp_join_timer(unsigned long data)
+static void garp_join_timer(struct timer_list *t)
 {
-	struct garp_applicant *app = (struct garp_applicant *)data;
+	struct garp_applicant *app = from_timer(app, t, join_timer);
 
 	spin_lock(&app->lock);
 	garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
@@ -584,7 +584,7 @@ int garp_init_applicant(struct net_device *dev, struct garp_application *appl)
 	spin_lock_init(&app->lock);
 	skb_queue_head_init(&app->queue);
 	rcu_assign_pointer(dev->garp_port->applicants[appl->type], app);
-	setup_timer(&app->join_timer, garp_join_timer, (unsigned long)app);
+	timer_setup(&app->join_timer, garp_join_timer, 0);
 	garp_join_timer_arm(app);
 	return 0;
 
diff --git a/net/802/mrp.c b/net/802/mrp.c
index be4dd3165347..a808dd5bbb27 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -586,9 +586,9 @@ static void mrp_join_timer_arm(struct mrp_applicant *app)
 	mod_timer(&app->join_timer, jiffies + delay);
 }
 
-static void mrp_join_timer(unsigned long data)
+static void mrp_join_timer(struct timer_list *t)
 {
-	struct mrp_applicant *app = (struct mrp_applicant *)data;
+	struct mrp_applicant *app = from_timer(app, t, join_timer);
 
 	spin_lock(&app->lock);
 	mrp_mad_event(app, MRP_EVENT_TX);
@@ -605,9 +605,9 @@ static void mrp_periodic_timer_arm(struct mrp_applicant *app)
 		  jiffies + msecs_to_jiffies(mrp_periodic_time));
 }
 
-static void mrp_periodic_timer(unsigned long data)
+static void mrp_periodic_timer(struct timer_list *t)
 {
-	struct mrp_applicant *app = (struct mrp_applicant *)data;
+	struct mrp_applicant *app = from_timer(app, t, periodic_timer);
 
 	spin_lock(&app->lock);
 	mrp_mad_event(app, MRP_EVENT_PERIODIC);
@@ -865,10 +865,9 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl)
 	spin_lock_init(&app->lock);
 	skb_queue_head_init(&app->queue);
 	rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app);
-	setup_timer(&app->join_timer, mrp_join_timer, (unsigned long)app);
+	timer_setup(&app->join_timer, mrp_join_timer, 0);
 	mrp_join_timer_arm(app);
-	setup_timer(&app->periodic_timer, mrp_periodic_timer,
-		    (unsigned long)app);
+	timer_setup(&app->periodic_timer, mrp_periodic_timer, 0);
 	mrp_periodic_timer_arm(app);
 	return 0;
 
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 8ad3ec2610b6..309d7dbb36e8 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -310,7 +310,7 @@ static void __aarp_expire_device(struct aarp_entry **n, struct net_device *dev)
 }
 
 /* Handle the timer event */
-static void aarp_expire_timeout(unsigned long unused)
+static void aarp_expire_timeout(struct timer_list *unused)
 {
 	int ct;
 
@@ -884,7 +884,7 @@ void __init aarp_proto_init(void)
 	aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv);
 	if (!aarp_dl)
 		printk(KERN_CRIT "Unable to register AARP with SNAP.\n");
-	setup_timer(&aarp_timer, aarp_expire_timeout, 0);
+	timer_setup(&aarp_timer, aarp_expire_timeout, 0);
 	aarp_timer.expires  = jiffies + sysctl_aarp_expiry_time;
 	add_timer(&aarp_timer);
 	register_netdevice_notifier(&aarp_notifier);
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 5d035c1f1156..03a9fc0771c0 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -158,9 +158,9 @@ found:
 	return s;
 }
 
-static void atalk_destroy_timer(unsigned long data)
+static void atalk_destroy_timer(struct timer_list *t)
 {
-	struct sock *sk = (struct sock *)data;
+	struct sock *sk = from_timer(sk, t, sk_timer);
 
 	if (sk_has_allocations(sk)) {
 		sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
@@ -175,8 +175,7 @@ static inline void atalk_destroy_socket(struct sock *sk)
 	skb_queue_purge(&sk->sk_receive_queue);
 
 	if (sk_has_allocations(sk)) {
-		setup_timer(&sk->sk_timer, atalk_destroy_timer,
-				(unsigned long)sk);
+		timer_setup(&sk->sk_timer, atalk_destroy_timer, 0);
 		sk->sk_timer.expires	= jiffies + SOCK_DESTROY_TIME;
 		add_timer(&sk->sk_timer);
 	} else
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 4b90033f35a8..15cd2139381e 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -488,9 +488,9 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars)
  * Switch to Slow Start, set the ss_threshold to half of the current cwnd and
  * reset the cwnd to 3*MSS
  */
-static void batadv_tp_sender_timeout(unsigned long arg)
+static void batadv_tp_sender_timeout(struct timer_list *t)
 {
-	struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg;
+	struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer);
 	struct batadv_priv *bat_priv = tp_vars->bat_priv;
 
 	if (atomic_read(&tp_vars->sending) == 0)
@@ -1020,8 +1020,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
 	atomic64_set(&tp_vars->tot_sent, 0);
 
 	kref_get(&tp_vars->refcount);
-	setup_timer(&tp_vars->timer, batadv_tp_sender_timeout,
-		    (unsigned long)tp_vars);
+	timer_setup(&tp_vars->timer, batadv_tp_sender_timeout, 0);
 
 	tp_vars->bat_priv = bat_priv;
 	tp_vars->start_time = jiffies;
@@ -1109,9 +1108,9 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars)
  *  reached without received ack
  * @arg: address of the related tp_vars
  */
-static void batadv_tp_receiver_shutdown(unsigned long arg)
+static void batadv_tp_receiver_shutdown(struct timer_list *t)
 {
-	struct batadv_tp_vars *tp_vars = (struct batadv_tp_vars *)arg;
+	struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer);
 	struct batadv_tp_unacked *un, *safe;
 	struct batadv_priv *bat_priv;
 
@@ -1373,8 +1372,7 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv,
 	hlist_add_head_rcu(&tp_vars->list, &bat_priv->tp_list);
 
 	kref_get(&tp_vars->refcount);
-	setup_timer(&tp_vars->timer, batadv_tp_receiver_shutdown,
-		    (unsigned long)tp_vars);
+	timer_setup(&tp_vars->timer, batadv_tp_receiver_shutdown, 0);
 
 	batadv_tp_reset_receiver_timer(tp_vars);
 
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 8112893037bd..f2cec70d520c 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -398,9 +398,9 @@ static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum,
 	}
 }
 
-static void hidp_idle_timeout(unsigned long arg)
+static void hidp_idle_timeout(struct timer_list *t)
 {
-	struct hidp_session *session = (struct hidp_session *) arg;
+	struct hidp_session *session = from_timer(session, t, timer);
 
 	/* The HIDP user-space API only contains calls to add and remove
 	 * devices. There is no way to forward events of any kind. Therefore,
@@ -944,8 +944,7 @@ static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr,
 
 	/* device management */
 	INIT_WORK(&session->dev_init, hidp_session_dev_work);
-	setup_timer(&session->timer, hidp_idle_timeout,
-		    (unsigned long)session);
+	timer_setup(&session->timer, hidp_idle_timeout, 0);
 
 	/* session data */
 	mutex_init(&session->report_mutex);
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 4a0b41d75c84..b98225d65e87 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -233,9 +233,9 @@ static int rfcomm_check_security(struct rfcomm_dlc *d)
 				 d->out);
 }
 
-static void rfcomm_session_timeout(unsigned long arg)
+static void rfcomm_session_timeout(struct timer_list *t)
 {
-	struct rfcomm_session *s = (void *) arg;
+	struct rfcomm_session *s = from_timer(s, t, timer);
 
 	BT_DBG("session %p state %ld", s, s->state);
 
@@ -258,9 +258,9 @@ static void rfcomm_session_clear_timer(struct rfcomm_session *s)
 }
 
 /* ---- RFCOMM DLCs ---- */
-static void rfcomm_dlc_timeout(unsigned long arg)
+static void rfcomm_dlc_timeout(struct timer_list *t)
 {
-	struct rfcomm_dlc *d = (void *) arg;
+	struct rfcomm_dlc *d = from_timer(d, t, timer);
 
 	BT_DBG("dlc %p state %ld", d, d->state);
 
@@ -307,7 +307,7 @@ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio)
 	if (!d)
 		return NULL;
 
-	setup_timer(&d->timer, rfcomm_dlc_timeout, (unsigned long)d);
+	timer_setup(&d->timer, rfcomm_dlc_timeout, 0);
 
 	skb_queue_head_init(&d->tx_queue);
 	mutex_init(&d->lock);
@@ -650,7 +650,7 @@ static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state)
 
 	BT_DBG("session %p sock %p", s, sock);
 
-	setup_timer(&s->timer, rfcomm_session_timeout, (unsigned long) s);
+	timer_setup(&s->timer, rfcomm_session_timeout, 0);
 
 	INIT_LIST_HEAD(&s->dlcs);
 	s->state = state;
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 795e920a3281..08df57665e1f 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -73,9 +73,9 @@ struct sco_pinfo {
 #define SCO_CONN_TIMEOUT	(HZ * 40)
 #define SCO_DISCONN_TIMEOUT	(HZ * 2)
 
-static void sco_sock_timeout(unsigned long arg)
+static void sco_sock_timeout(struct timer_list *t)
 {
-	struct sock *sk = (struct sock *)arg;
+	struct sock *sk = from_timer(sk, t, sk_timer);
 
 	BT_DBG("sock %p state %d", sk, sk->sk_state);
 
@@ -487,7 +487,7 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock,
 
 	sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT;
 
-	setup_timer(&sk->sk_timer, sco_sock_timeout, (unsigned long)sk);
+	timer_setup(&sk->sk_timer, sco_sock_timeout, 0);
 
 	bt_sock_link(&sco_sk_list, sk);
 	return sk;
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 70ccda233bd1..c7785efeea57 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -144,9 +144,9 @@ static void send_dm_alert(struct work_struct *work)
  * in the event that more drops will arrive during the
  * hysteresis period.
  */
-static void sched_send_work(unsigned long _data)
+static void sched_send_work(struct timer_list *t)
 {
-	struct per_cpu_dm_data *data = (struct per_cpu_dm_data *)_data;
+	struct per_cpu_dm_data *data = from_timer(data, t, send_timer);
 
 	schedule_work(&data->dm_alert_work);
 }
@@ -412,8 +412,7 @@ static int __init init_net_drop_monitor(void)
 	for_each_possible_cpu(cpu) {
 		data = &per_cpu(dm_cpu_data, cpu);
 		INIT_WORK(&data->dm_alert_work, send_dm_alert);
-		setup_timer(&data->send_timer, sched_send_work,
-			    (unsigned long)data);
+		timer_setup(&data->send_timer, sched_send_work, 0);
 		spin_lock_init(&data->lock);
 		reset_per_cpu_data(data);
 	}
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 7c1ffd6f9501..9834cfa21b21 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -76,9 +76,9 @@ static void est_fetch_counters(struct net_rate_estimator *e,
 
 }
 
-static void est_timer(unsigned long arg)
+static void est_timer(struct timer_list *t)
 {
-	struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+	struct net_rate_estimator *est = from_timer(est, t, timer);
 	struct gnet_stats_basic_packed b;
 	u64 rate, brate;
 
@@ -170,7 +170,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	}
 
 	est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
-	setup_timer(&est->timer, est_timer, (unsigned long)est);
+	timer_setup(&est->timer, est_timer, 0);
 	mod_timer(&est->timer, est->next_jiffies);
 
 	rcu_assign_pointer(*rate_est, est);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 6ea3a1a7f36a..d1f5fe986edd 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -51,7 +51,7 @@ do {						\
 
 #define PNEIGH_HASHMASK		0xF
 
-static void neigh_timer_handler(unsigned long arg);
+static void neigh_timer_handler(struct timer_list *t);
 static void __neigh_notify(struct neighbour *n, int type, int flags,
 			   u32 pid);
 static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
@@ -331,7 +331,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 	n->output	  = neigh_blackhole;
 	seqlock_init(&n->hh.hh_lock);
 	n->parms	  = neigh_parms_clone(&tbl->parms);
-	setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n);
+	timer_setup(&n->timer, neigh_timer_handler, 0);
 
 	NEIGH_CACHE_STAT_INC(tbl, allocs);
 	n->tbl		  = tbl;
@@ -903,10 +903,10 @@ static void neigh_probe(struct neighbour *neigh)
 
 /* Called when a timer expires for a neighbour entry. */
 
-static void neigh_timer_handler(unsigned long arg)
+static void neigh_timer_handler(struct timer_list *t)
 {
 	unsigned long now, next;
-	struct neighbour *neigh = (struct neighbour *)arg;
+	struct neighbour *neigh = from_timer(neigh, t, timer);
 	unsigned int state;
 	int notify = 0;
 
@@ -1391,9 +1391,9 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(neigh_direct_output);
 
-static void neigh_proxy_process(unsigned long arg)
+static void neigh_proxy_process(struct timer_list *t)
 {
-	struct neigh_table *tbl = (struct neigh_table *)arg;
+	struct neigh_table *tbl = from_timer(tbl, t, proxy_timer);
 	long sched_next = 0;
 	unsigned long now = jiffies;
 	struct sk_buff *skb, *n;
@@ -1573,7 +1573,7 @@ void neigh_table_init(int index, struct neigh_table *tbl)
 	INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
 	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
 			tbl->parms.reachable_time);
-	setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
+	timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0);
 	skb_queue_head_init_class(&tbl->proxy_queue,
 			&neigh_table_proxy_queue_class);
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index de4a0cafb19f..324cb9f2f551 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -183,7 +183,7 @@ static __inline__ unsigned int dn_hash(__le16 src, __le16 dst)
 	return dn_rt_hash_mask & (unsigned int)tmp;
 }
 
-static void dn_dst_check_expire(unsigned long dummy)
+static void dn_dst_check_expire(struct timer_list *unused)
 {
 	int i;
 	struct dn_route *rt;
@@ -1875,7 +1875,7 @@ void __init dn_route_init(void)
 		kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 	dst_entries_init(&dn_dst_ops);
-	setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
+	timer_setup(&dn_route_timer, dn_dst_check_expire, 0);
 	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
 	add_timer(&dn_route_timer);
 
diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c
index f430daed24a0..aa4155875ca8 100644
--- a/net/decnet/dn_timer.c
+++ b/net/decnet/dn_timer.c
@@ -34,11 +34,11 @@
 
 #define SLOW_INTERVAL (HZ/2)
 
-static void dn_slow_timer(unsigned long arg);
+static void dn_slow_timer(struct timer_list *t);
 
 void dn_start_slow_timer(struct sock *sk)
 {
-	setup_timer(&sk->sk_timer, dn_slow_timer, (unsigned long)sk);
+	timer_setup(&sk->sk_timer, dn_slow_timer, 0);
 	sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL);
 }
 
@@ -47,9 +47,9 @@ void dn_stop_slow_timer(struct sock *sk)
 	sk_stop_timer(sk, &sk->sk_timer);
 }
 
-static void dn_slow_timer(unsigned long arg)
+static void dn_slow_timer(struct timer_list *t)
 {
-	struct sock *sk = (struct sock *)arg;
+	struct sock *sk = from_timer(sk, t, sk_timer);
 	struct dn_scp *scp = DN_SK(sk);
 
 	bh_lock_sock(sk);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index ab183af0b5b6..d1f8f302dbf3 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -752,18 +752,18 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
 	return ip_local_out(net, skb->sk, skb);
 }
 
-static void igmp_gq_timer_expire(unsigned long data)
+static void igmp_gq_timer_expire(struct timer_list *t)
 {
-	struct in_device *in_dev = (struct in_device *)data;
+	struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer);
 
 	in_dev->mr_gq_running = 0;
 	igmpv3_send_report(in_dev, NULL);
 	in_dev_put(in_dev);
 }
 
-static void igmp_ifc_timer_expire(unsigned long data)
+static void igmp_ifc_timer_expire(struct timer_list *t)
 {
-	struct in_device *in_dev = (struct in_device *)data;
+	struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
 
 	igmpv3_send_cr(in_dev);
 	if (in_dev->mr_ifc_count) {
@@ -784,9 +784,9 @@ static void igmp_ifc_event(struct in_device *in_dev)
 }
 
 
-static void igmp_timer_expire(unsigned long data)
+static void igmp_timer_expire(struct timer_list *t)
 {
-	struct ip_mc_list *im = (struct ip_mc_list *)data;
+	struct ip_mc_list *im = from_timer(im, t, timer);
 	struct in_device *in_dev = im->interface;
 
 	spin_lock(&im->lock);
@@ -1385,7 +1385,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
 	refcount_set(&im->refcnt, 1);
 	spin_lock_init(&im->lock);
 #ifdef CONFIG_IP_MULTICAST
-	setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
+	timer_setup(&im->timer, igmp_timer_expire, 0);
 	im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
 #endif
 
@@ -1695,10 +1695,8 @@ void ip_mc_init_dev(struct in_device *in_dev)
 	ASSERT_RTNL();
 
 #ifdef CONFIG_IP_MULTICAST
-	setup_timer(&in_dev->mr_gq_timer, igmp_gq_timer_expire,
-			(unsigned long)in_dev);
-	setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
-			(unsigned long)in_dev);
+	timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
+	timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
 	in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
 #endif
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 40a43ad294cb..fd5f19c988e4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -112,7 +112,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
 				 int cmd);
 static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
 static void mroute_clean_tables(struct mr_table *mrt, bool all);
-static void ipmr_expire_process(unsigned long arg);
+static void ipmr_expire_process(struct timer_list *t);
 
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
 #define ipmr_for_each_table(mrt, net) \
@@ -375,8 +375,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
 	INIT_LIST_HEAD(&mrt->mfc_cache_list);
 	INIT_LIST_HEAD(&mrt->mfc_unres_queue);
 
-	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
-		    (unsigned long)mrt);
+	timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
 
 	mrt->mroute_reg_vif_num = -1;
 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -804,9 +803,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 }
 
 /* Timer process for the unresolved queue. */
-static void ipmr_expire_process(unsigned long arg)
+static void ipmr_expire_process(struct timer_list *t)
 {
-	struct mr_table *mrt = (struct mr_table *)arg;
+	struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
 	unsigned long now;
 	unsigned long expires;
 	struct mfc_cache *c, *next;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a0ae1c9d37df..f49bd7897e95 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -188,7 +188,7 @@ static void addrconf_dad_start(struct inet6_ifaddr *ifp);
 static void addrconf_dad_work(struct work_struct *w);
 static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id);
 static void addrconf_dad_run(struct inet6_dev *idev);
-static void addrconf_rs_timer(unsigned long data);
+static void addrconf_rs_timer(struct timer_list *t);
 static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
 
@@ -388,8 +388,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
 	rwlock_init(&ndev->lock);
 	ndev->dev = dev;
 	INIT_LIST_HEAD(&ndev->addr_list);
-	setup_timer(&ndev->rs_timer, addrconf_rs_timer,
-		    (unsigned long)ndev);
+	timer_setup(&ndev->rs_timer, addrconf_rs_timer, 0);
 	memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
 
 	if (ndev->cnf.stable_secret.initialized)
@@ -3741,9 +3740,9 @@ restart:
 	return 0;
 }
 
-static void addrconf_rs_timer(unsigned long data)
+static void addrconf_rs_timer(struct timer_list *t)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct inet6_dev *idev = from_timer(idev, t, rs_timer);
 	struct net_device *dev = idev->dev;
 	struct in6_addr lladdr;
 
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 9c24b85949c1..a2e1a864eb46 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -120,7 +120,7 @@ static void mrt6msg_netlink_event(struct mr6_table *mrt, struct sk_buff *pkt);
 static int ip6mr_rtm_dumproute(struct sk_buff *skb,
 			       struct netlink_callback *cb);
 static void mroute_clean_tables(struct mr6_table *mrt, bool all);
-static void ipmr_expire_process(unsigned long arg);
+static void ipmr_expire_process(struct timer_list *t);
 
 #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
 #define ip6mr_for_each_table(mrt, net) \
@@ -320,8 +320,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
 
 	INIT_LIST_HEAD(&mrt->mfc6_unres_queue);
 
-	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
-		    (unsigned long)mrt);
+	timer_setup(&mrt->ipmr_expire_timer, ipmr_expire_process, 0);
 
 #ifdef CONFIG_IPV6_PIMSM_V2
 	mrt->mroute_reg_vif_num = -1;
@@ -888,9 +887,9 @@ static void ipmr_do_expire_process(struct mr6_table *mrt)
 		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
 }
 
-static void ipmr_expire_process(unsigned long arg)
+static void ipmr_expire_process(struct timer_list *t)
 {
-	struct mr6_table *mrt = (struct mr6_table *)arg;
+	struct mr6_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
 
 	if (!spin_trylock(&mfc_unres_lock)) {
 		mod_timer(&mrt->ipmr_expire_timer, jiffies + 1);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 12b7c27ce5ce..fc6d7d143f2c 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -75,10 +75,10 @@ static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;
 
 static void igmp6_join_group(struct ifmcaddr6 *ma);
 static void igmp6_leave_group(struct ifmcaddr6 *ma);
-static void igmp6_timer_handler(unsigned long data);
+static void igmp6_timer_handler(struct timer_list *t);
 
-static void mld_gq_timer_expire(unsigned long data);
-static void mld_ifc_timer_expire(unsigned long data);
+static void mld_gq_timer_expire(struct timer_list *t);
+static void mld_ifc_timer_expire(struct timer_list *t);
 static void mld_ifc_event(struct inet6_dev *idev);
 static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
 static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
@@ -839,7 +839,7 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
 	if (!mc)
 		return NULL;
 
-	setup_timer(&mc->mca_timer, igmp6_timer_handler, (unsigned long)mc);
+	timer_setup(&mc->mca_timer, igmp6_timer_handler, 0);
 
 	mc->mca_addr = *addr;
 	mc->idev = idev; /* reference taken by caller */
@@ -2083,9 +2083,9 @@ void ipv6_mc_dad_complete(struct inet6_dev *idev)
 	}
 }
 
-static void mld_dad_timer_expire(unsigned long data)
+static void mld_dad_timer_expire(struct timer_list *t)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct inet6_dev *idev = from_timer(idev, t, mc_dad_timer);
 
 	mld_send_initial_cr(idev);
 	if (idev->mc_dad_count) {
@@ -2432,18 +2432,18 @@ static void igmp6_leave_group(struct ifmcaddr6 *ma)
 	}
 }
 
-static void mld_gq_timer_expire(unsigned long data)
+static void mld_gq_timer_expire(struct timer_list *t)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct inet6_dev *idev = from_timer(idev, t, mc_gq_timer);
 
 	idev->mc_gq_running = 0;
 	mld_send_report(idev, NULL);
 	in6_dev_put(idev);
 }
 
-static void mld_ifc_timer_expire(unsigned long data)
+static void mld_ifc_timer_expire(struct timer_list *t)
 {
-	struct inet6_dev *idev = (struct inet6_dev *)data;
+	struct inet6_dev *idev = from_timer(idev, t, mc_ifc_timer);
 
 	mld_send_cr(idev);
 	if (idev->mc_ifc_count) {
@@ -2462,9 +2462,9 @@ static void mld_ifc_event(struct inet6_dev *idev)
 	mld_ifc_start_timer(idev, 1);
 }
 
-static void igmp6_timer_handler(unsigned long data)
+static void igmp6_timer_handler(struct timer_list *t)
 {
-	struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data;
+	struct ifmcaddr6 *ma = from_timer(ma, t, mca_timer);
 
 	if (mld_in_v1_mode(ma->idev))
 		igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
@@ -2552,14 +2552,11 @@ void ipv6_mc_init_dev(struct inet6_dev *idev)
 	write_lock_bh(&idev->lock);
 	spin_lock_init(&idev->mc_lock);
 	idev->mc_gq_running = 0;
-	setup_timer(&idev->mc_gq_timer, mld_gq_timer_expire,
-			(unsigned long)idev);
+	timer_setup(&idev->mc_gq_timer, mld_gq_timer_expire, 0);
 	idev->mc_tomb = NULL;
 	idev->mc_ifc_count = 0;
-	setup_timer(&idev->mc_ifc_timer, mld_ifc_timer_expire,
-			(unsigned long)idev);
-	setup_timer(&idev->mc_dad_timer, mld_dad_timer_expire,
-		    (unsigned long)idev);
+	timer_setup(&idev->mc_ifc_timer, mld_ifc_timer_expire, 0);
+	timer_setup(&idev->mc_dad_timer, mld_dad_timer_expire, 0);
 	ipv6_mc_reset(idev);
 	write_unlock_bh(&idev->lock);
 }
diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c
index a2b904a718c6..3ccf8dfb233e 100644
--- a/net/ncsi/ncsi-manage.c
+++ b/net/ncsi/ncsi-manage.c
@@ -529,9 +529,9 @@ struct ncsi_dev *ncsi_find_dev(struct net_device *dev)
 	return NULL;
 }
 
-static void ncsi_request_timeout(unsigned long data)
+static void ncsi_request_timeout(struct timer_list *t)
 {
-	struct ncsi_request *nr = (struct ncsi_request *)data;
+	struct ncsi_request *nr = from_timer(nr, t, timer);
 	struct ncsi_dev_priv *ndp = nr->ndp;
 	unsigned long flags;
 
@@ -1577,9 +1577,7 @@ struct ncsi_dev *ncsi_register_dev(struct net_device *dev,
 	for (i = 0; i < ARRAY_SIZE(ndp->requests); i++) {
 		ndp->requests[i].id = i;
 		ndp->requests[i].ndp = ndp;
-		setup_timer(&ndp->requests[i].timer,
-			    ncsi_request_timeout,
-			    (unsigned long)&ndp->requests[i]);
+		timer_setup(&ndp->requests[i].timer, ncsi_request_timeout, 0);
 	}
 
 	spin_lock_irqsave(&ncsi_dev_lock, flags);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 64778f9a8548..d6748a8a79c5 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -67,9 +67,9 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
 }
 EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);
 
-static void nf_ct_expectation_timed_out(unsigned long ul_expect)
+static void nf_ct_expectation_timed_out(struct timer_list *t)
 {
-	struct nf_conntrack_expect *exp = (void *)ul_expect;
+	struct nf_conntrack_expect *exp = from_timer(exp, t, timeout);
 
 	spin_lock_bh(&nf_conntrack_expect_lock);
 	nf_ct_unlink_expect(exp);
@@ -368,8 +368,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
 	/* two references : one for hash insert, one for the timer */
 	refcount_add(2, &exp->use);
 
-	setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
-		    (unsigned long)exp);
+	timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0);
 	helper = rcu_dereference_protected(master_help->helper,
 					   lockdep_is_held(&nf_conntrack_expect_lock));
 	if (helper) {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index cad6498f10b0..e5afab86381c 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -151,7 +151,7 @@ instance_put(struct nfulnl_instance *inst)
 		call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu);
 }
 
-static void nfulnl_timer(unsigned long data);
+static void nfulnl_timer(struct timer_list *t);
 
 static struct nfulnl_instance *
 instance_create(struct net *net, u_int16_t group_num,
@@ -184,7 +184,7 @@ instance_create(struct net *net, u_int16_t group_num,
 	/* needs to be two, since we _put() after creation */
 	refcount_set(&inst->use, 2);
 
-	setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst);
+	timer_setup(&inst->timer, nfulnl_timer, 0);
 
 	inst->net = get_net(net);
 	inst->peer_user_ns = user_ns;
@@ -377,9 +377,9 @@ __nfulnl_flush(struct nfulnl_instance *inst)
 }
 
 static void
-nfulnl_timer(unsigned long data)
+nfulnl_timer(struct timer_list *t)
 {
-	struct nfulnl_instance *inst = (struct nfulnl_instance *)data;
+	struct nfulnl_instance *inst = from_timer(inst, t, timer);
 
 	spin_lock_bh(&inst->lock);
 	if (inst->skb)
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index daf45da448fa..ee3421ad108d 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -107,9 +107,9 @@ static void idletimer_tg_work(struct work_struct *work)
 	sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
 }
 
-static void idletimer_tg_expired(unsigned long data)
+static void idletimer_tg_expired(struct timer_list *t)
 {
-	struct idletimer_tg *timer = (struct idletimer_tg *) data;
+	struct idletimer_tg *timer = from_timer(timer, t, timer);
 
 	pr_debug("timer %s expired\n", timer->attr.attr.name);
 
@@ -143,8 +143,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
 
 	list_add(&info->timer->entry, &idletimer_tg_list);
 
-	setup_timer(&info->timer->timer, idletimer_tg_expired,
-		    (unsigned long) info->timer);
+	timer_setup(&info->timer->timer, idletimer_tg_expired, 0);
 	info->timer->refcnt = 1;
 
 	mod_timer(&info->timer->timer,
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index 3ba31c194cce..0971634e5444 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -85,9 +85,10 @@ led_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static void led_timeout_callback(unsigned long data)
+static void led_timeout_callback(struct timer_list *t)
 {
-	struct xt_led_info_internal *ledinternal = (struct xt_led_info_internal *)data;
+	struct xt_led_info_internal *ledinternal = from_timer(ledinternal, t,
+							      timer);
 
 	led_trigger_event(&ledinternal->netfilter_led_trigger, LED_OFF);
 }
@@ -143,8 +144,7 @@ static int led_tg_check(const struct xt_tgchk_param *par)
 
 	/* See if we need to set up a timer */
 	if (ledinfo->delay > 0)
-		setup_timer(&ledinternal->timer, led_timeout_callback,
-			    (unsigned long)ledinternal);
+		timer_setup(&ledinternal->timer, led_timeout_callback, 0);
 
 	list_add_tail(&ledinternal->list, &xt_led_triggers);
 
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index c25e9b4179c3..074960154993 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -591,18 +591,18 @@ static int nci_close_device(struct nci_dev *ndev)
 }
 
 /* NCI command timer function */
-static void nci_cmd_timer(unsigned long arg)
+static void nci_cmd_timer(struct timer_list *t)
 {
-	struct nci_dev *ndev = (void *) arg;
+	struct nci_dev *ndev = from_timer(ndev, t, cmd_timer);
 
 	atomic_set(&ndev->cmd_cnt, 1);
 	queue_work(ndev->cmd_wq, &ndev->cmd_work);
 }
 
 /* NCI data exchange timer function */
-static void nci_data_timer(unsigned long arg)
+static void nci_data_timer(struct timer_list *t)
 {
-	struct nci_dev *ndev = (void *) arg;
+	struct nci_dev *ndev = from_timer(ndev, t, data_timer);
 
 	set_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags);
 	queue_work(ndev->rx_wq, &ndev->rx_work);
@@ -1232,10 +1232,8 @@ int nci_register_device(struct nci_dev *ndev)
 	skb_queue_head_init(&ndev->rx_q);
 	skb_queue_head_init(&ndev->tx_q);
 
-	setup_timer(&ndev->cmd_timer, nci_cmd_timer,
-		    (unsigned long) ndev);
-	setup_timer(&ndev->data_timer, nci_data_timer,
-		    (unsigned long) ndev);
+	timer_setup(&ndev->cmd_timer, nci_cmd_timer, 0);
+	timer_setup(&ndev->data_timer, nci_data_timer, 0);
 
 	mutex_init(&ndev->req_lock);
 	INIT_LIST_HEAD(&ndev->conn_info_list);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 4c7fbc6dcce7..994dc2df57e4 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -45,9 +45,9 @@ const char *const rxrpc_call_completions[NR__RXRPC_CALL_COMPLETIONS] = {
 
 struct kmem_cache *rxrpc_call_jar;
 
-static void rxrpc_call_timer_expired(unsigned long _call)
+static void rxrpc_call_timer_expired(struct timer_list *t)
 {
-	struct rxrpc_call *call = (struct rxrpc_call *)_call;
+	struct rxrpc_call *call = from_timer(call, t, timer);
 
 	_enter("%d", call->debug_id);
 
@@ -114,8 +114,7 @@ struct rxrpc_call *rxrpc_alloc_call(gfp_t gfp)
 		goto nomem_2;
 
 	mutex_init(&call->user_mutex);
-	setup_timer(&call->timer, rxrpc_call_timer_expired,
-		    (unsigned long)call);
+	timer_setup(&call->timer, rxrpc_call_timer_expired, 0);
 	INIT_WORK(&call->processor, &rxrpc_process_call);
 	INIT_LIST_HEAD(&call->link);
 	INIT_LIST_HEAD(&call->chan_wait_link);
diff --git a/net/wireless/lib80211.c b/net/wireless/lib80211.c
index 459611577d3d..801d4781a73b 100644
--- a/net/wireless/lib80211.c
+++ b/net/wireless/lib80211.c
@@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(lib80211_crypto_lock);
 static void lib80211_crypt_deinit_entries(struct lib80211_crypt_info *info,
 					  int force);
 static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info);
-static void lib80211_crypt_deinit_handler(unsigned long data);
+static void lib80211_crypt_deinit_handler(struct timer_list *t);
 
 int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name,
 				spinlock_t *lock)
@@ -55,8 +55,8 @@ int lib80211_crypt_info_init(struct lib80211_crypt_info *info, char *name,
 	info->lock = lock;
 
 	INIT_LIST_HEAD(&info->crypt_deinit_list);
-	setup_timer(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler,
-			(unsigned long)info);
+	timer_setup(&info->crypt_deinit_timer, lib80211_crypt_deinit_handler,
+		    0);
 
 	return 0;
 }
@@ -116,9 +116,10 @@ static void lib80211_crypt_quiescing(struct lib80211_crypt_info *info)
 	spin_unlock_irqrestore(info->lock, flags);
 }
 
-static void lib80211_crypt_deinit_handler(unsigned long data)
+static void lib80211_crypt_deinit_handler(struct timer_list *t)
 {
-	struct lib80211_crypt_info *info = (struct lib80211_crypt_info *)data;
+	struct lib80211_crypt_info *info = from_timer(info, t,
+						      crypt_deinit_timer);
 	unsigned long flags;
 
 	lib80211_crypt_deinit_entries(info, 0);
diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c
index e0cd04d28352..a6a8ab09b914 100644
--- a/net/x25/x25_link.c
+++ b/net/x25/x25_link.c
@@ -36,7 +36,7 @@
 LIST_HEAD(x25_neigh_list);
 DEFINE_RWLOCK(x25_neigh_list_lock);
 
-static void x25_t20timer_expiry(unsigned long);
+static void x25_t20timer_expiry(struct timer_list *);
 
 static void x25_transmit_restart_confirmation(struct x25_neigh *nb);
 static void x25_transmit_restart_request(struct x25_neigh *nb);
@@ -49,9 +49,9 @@ static inline void x25_start_t20timer(struct x25_neigh *nb)
 	mod_timer(&nb->t20timer, jiffies + nb->t20);
 }
 
-static void x25_t20timer_expiry(unsigned long param)
+static void x25_t20timer_expiry(struct timer_list *t)
 {
-	struct x25_neigh *nb = (struct x25_neigh *)param;
+	struct x25_neigh *nb = from_timer(nb, t, t20timer);
 
 	x25_transmit_restart_request(nb);
 
@@ -252,7 +252,7 @@ void x25_link_device_up(struct net_device *dev)
 		return;
 
 	skb_queue_head_init(&nb->queue);
-	setup_timer(&nb->t20timer, x25_t20timer_expiry, (unsigned long)nb);
+	timer_setup(&nb->t20timer, x25_t20timer_expiry, 0);
 
 	dev_hold(dev);
 	nb->dev      = dev;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 1f5cee2269af..065d89606888 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -556,7 +556,7 @@ out:
 	return HRTIMER_NORESTART;
 }
 
-static void xfrm_replay_timer_handler(unsigned long data);
+static void xfrm_replay_timer_handler(struct timer_list *t);
 
 struct xfrm_state *xfrm_state_alloc(struct net *net)
 {
@@ -574,8 +574,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 		INIT_HLIST_NODE(&x->byspi);
 		tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
 					CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
-		setup_timer(&x->rtimer, xfrm_replay_timer_handler,
-				(unsigned long)x);
+		timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0);
 		x->curlft.add_time = get_seconds();
 		x->lft.soft_byte_limit = XFRM_INF;
 		x->lft.soft_packet_limit = XFRM_INF;
@@ -1879,9 +1878,9 @@ void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net)
 }
 EXPORT_SYMBOL(xfrm_state_walk_done);
 
-static void xfrm_replay_timer_handler(unsigned long data)
+static void xfrm_replay_timer_handler(struct timer_list *t)
 {
-	struct xfrm_state *x = (struct xfrm_state *)data;
+	struct xfrm_state *x = from_timer(x, t, rtimer);
 
 	spin_lock(&x->lock);
 
-- 
cgit v1.3-14-g43fede


From c1eba5bcb6430868427e0b9d1cd1205a07302f06 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sun, 22 Oct 2017 18:18:19 -0700
Subject: timer: Pass timer_list pointer to callbacks unconditionally

Now that all timer callbacks are already taking their struct timer_list
pointer as the callback argument, just do this unconditionally and remove
the .data field.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/timer.h |  4 ----
 kernel/time/timer.c   | 17 +++++++----------
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 47615dca4c5c..20a6e7af5fd6 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -18,7 +18,6 @@ struct timer_list {
 	struct hlist_node	entry;
 	unsigned long		expires;
 	void			(*function)(unsigned long);
-	unsigned long		data;
 	u32			flags;
 
 #ifdef CONFIG_LOCKDEP
@@ -70,7 +69,6 @@ struct timer_list {
 #define __TIMER_INITIALIZER(_function, _data, _flags) {		\
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
-		.data = (_data),				\
 		.flags = (_flags),				\
 		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
 			__FILE__ ":" __stringify(__LINE__))	\
@@ -121,14 +119,12 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
 	do {								\
 		__init_timer((_timer), (_flags));			\
 		(_timer)->function = (_fn);				\
-		(_timer)->data = (_data);				\
 	} while (0)
 
 #define __setup_timer_on_stack(_timer, _fn, _data, _flags)		\
 	do {								\
 		__init_timer_on_stack((_timer), (_flags));		\
 		(_timer)->function = (_fn);				\
-		(_timer)->data = (_data);				\
 	} while (0)
 
 #ifndef CONFIG_LOCKDEP
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index af0b8bae4502..a07eb124332f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1107,12 +1107,12 @@ EXPORT_SYMBOL(timer_reduce);
  * add_timer - start a timer
  * @timer: the timer to be added
  *
- * The kernel will do a ->function(->data) callback from the
+ * The kernel will do a ->function(@timer) callback from the
  * timer interrupt at the ->expires point in the future. The
  * current time is 'jiffies'.
  *
- * The timer's ->expires, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
+ * The timer's ->expires, ->function fields must be set prior calling this
+ * function.
  *
  * Timers with an ->expires field in the past will be executed in the next
  * timer tick.
@@ -1284,8 +1284,7 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
-			  unsigned long data)
+static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long))
 {
 	int count = preempt_count();
 
@@ -1309,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 	lock_map_acquire(&lockdep_map);
 
 	trace_timer_expire_entry(timer);
-	fn(data);
+	fn((TIMER_DATA_TYPE)timer);
 	trace_timer_expire_exit(timer);
 
 	lock_map_release(&lockdep_map);
@@ -1332,7 +1331,6 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 	while (!hlist_empty(head)) {
 		struct timer_list *timer;
 		void (*fn)(unsigned long);
-		unsigned long data;
 
 		timer = hlist_entry(head->first, struct timer_list, entry);
 
@@ -1340,15 +1338,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 		detach_timer(timer, true);
 
 		fn = timer->function;
-		data = timer->data;
 
 		if (timer->flags & TIMER_IRQSAFE) {
 			raw_spin_unlock(&base->lock);
-			call_timer_fn(timer, fn, data);
+			call_timer_fn(timer, fn);
 			raw_spin_lock(&base->lock);
 		} else {
 			raw_spin_unlock_irq(&base->lock);
-			call_timer_fn(timer, fn, data);
+			call_timer_fn(timer, fn);
 			raw_spin_lock_irq(&base->lock);
 		}
 	}
-- 
cgit v1.3-14-g43fede


From 354b46b1a0adda1dd5b7f0bc2a5604cca091be5f Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sun, 22 Oct 2017 19:15:40 -0700
Subject: timer: Switch callback prototype to take struct timer_list * argument

Since all callbacks have been converted, we can switch the core
prototype to "struct timer_list *" now too.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/timer.h | 4 ++--
 kernel/time/timer.c   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 20a6e7af5fd6..a6d04fb72c9e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -17,7 +17,7 @@ struct timer_list {
 	 */
 	struct hlist_node	entry;
 	unsigned long		expires;
-	void			(*function)(unsigned long);
+	void			(*function)(struct timer_list *);
 	u32			flags;
 
 #ifdef CONFIG_LOCKDEP
@@ -63,7 +63,7 @@ struct timer_list {
 
 #define TIMER_TRACE_FLAGMASK	(TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
 
-#define TIMER_DATA_TYPE		unsigned long
+#define TIMER_DATA_TYPE		struct timer_list *
 #define TIMER_FUNC_TYPE		void (*)(TIMER_DATA_TYPE)
 
 #define __TIMER_INITIALIZER(_function, _data, _flags) {		\
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a07eb124332f..0f0d49a02d04 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1284,7 +1284,7 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long))
+static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
 {
 	int count = preempt_count();
 
@@ -1308,7 +1308,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long))
 	lock_map_acquire(&lockdep_map);
 
 	trace_timer_expire_entry(timer);
-	fn((TIMER_DATA_TYPE)timer);
+	fn(timer);
 	trace_timer_expire_exit(timer);
 
 	lock_map_release(&lockdep_map);
@@ -1330,7 +1330,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 {
 	while (!hlist_empty(head)) {
 		struct timer_list *timer;
-		void (*fn)(unsigned long);
+		void (*fn)(struct timer_list *);
 
 		timer = hlist_entry(head->first, struct timer_list, entry);
 
-- 
cgit v1.3-14-g43fede


From 188665b2d67db8953899551d1a9d4481b2a0ac60 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Sun, 22 Oct 2017 18:14:46 -0700
Subject: timer: Pass function down to initialization routines

In preparation for removing more macros, pass the function down to the
initialization routines instead of doing it in macros.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 include/linux/timer.h | 33 ++++++++++++++++++---------------
 kernel/time/timer.c   | 21 +++++++++++++++------
 2 files changed, 33 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index e6bab51db13d..aff73b1c8f7b 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -78,53 +78,56 @@ struct timer_list {
 	struct timer_list _name =				\
 		__TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0)
 
-void init_timer_key(struct timer_list *timer, unsigned int flags,
+void init_timer_key(struct timer_list *timer,
+		    void (*func)(struct timer_list *), unsigned int flags,
 		    const char *name, struct lock_class_key *key);
 
 #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
 extern void init_timer_on_stack_key(struct timer_list *timer,
+				    void (*func)(struct timer_list *),
 				    unsigned int flags, const char *name,
 				    struct lock_class_key *key);
 extern void destroy_timer_on_stack(struct timer_list *timer);
 #else
 static inline void destroy_timer_on_stack(struct timer_list *timer) { }
 static inline void init_timer_on_stack_key(struct timer_list *timer,
-					   unsigned int flags, const char *name,
+					   void (*func)(struct timer_list *),
+					   unsigned int flags,
+					   const char *name,
 					   struct lock_class_key *key)
 {
-	init_timer_key(timer, flags, name, key);
+	init_timer_key(timer, func, flags, name, key);
 }
 #endif
 
 #ifdef CONFIG_LOCKDEP
-#define __init_timer(_timer, _flags)					\
+#define __init_timer(_timer, _fn, _flags)				\
 	do {								\
 		static struct lock_class_key __key;			\
-		init_timer_key((_timer), (_flags), #_timer, &__key);	\
+		init_timer_key((_timer), (_fn), (_flags), #_timer, &__key);\
 	} while (0)
 
-#define __init_timer_on_stack(_timer, _flags)				\
+#define __init_timer_on_stack(_timer, _fn, _flags)			\
 	do {								\
 		static struct lock_class_key __key;			\
-		init_timer_on_stack_key((_timer), (_flags), #_timer, &__key); \
+		init_timer_on_stack_key((_timer), (_fn), (_flags),	\
+					#_timer, &__key);		 \
 	} while (0)
 #else
-#define __init_timer(_timer, _flags)					\
-	init_timer_key((_timer), (_flags), NULL, NULL)
-#define __init_timer_on_stack(_timer, _flags)				\
-	init_timer_on_stack_key((_timer), (_flags), NULL, NULL)
+#define __init_timer(_timer, _fn, _flags)				\
+	init_timer_key((_timer), (_fn), (_flags), NULL, NULL)
+#define __init_timer_on_stack(_timer, _fn, _flags)			\
+	init_timer_on_stack_key((_timer), (_fn), (_flags), NULL, NULL)
 #endif
 
 #define __setup_timer(_timer, _fn, _flags)				\
 	do {								\
-		__init_timer((_timer), (_flags));			\
-		(_timer)->function = (_fn);				\
+		__init_timer((_timer), (_fn), (_flags));		\
 	} while (0)
 
 #define __setup_timer_on_stack(_timer, _fn, _flags)			\
 	do {								\
-		__init_timer_on_stack((_timer), (_flags));		\
-		(_timer)->function = (_fn);				\
+		__init_timer_on_stack((_timer), (_fn), (_flags));	\
 	} while (0)
 
 #ifndef CONFIG_LOCKDEP
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 0f0d49a02d04..ffebcf878fba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -707,14 +707,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
 	debug_object_assert_init(timer, &timer_debug_descr);
 }
 
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+			  void (*func)(struct timer_list *),
+			  unsigned int flags,
 			  const char *name, struct lock_class_key *key);
 
-void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+void init_timer_on_stack_key(struct timer_list *timer,
+			     void (*func)(struct timer_list *),
+			     unsigned int flags,
 			     const char *name, struct lock_class_key *key)
 {
 	debug_object_init_on_stack(timer, &timer_debug_descr);
-	do_init_timer(timer, flags, name, key);
+	do_init_timer(timer, func, flags, name, key);
 }
 EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
 
@@ -755,10 +759,13 @@ static inline void debug_assert_init(struct timer_list *timer)
 	debug_timer_assert_init(timer);
 }
 
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+			  void (*func)(struct timer_list *),
+			  unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
 	timer->entry.pprev = NULL;
+	timer->function = func;
 	timer->flags = flags | raw_smp_processor_id();
 	lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
@@ -766,6 +773,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
 /**
  * init_timer_key - initialize a timer
  * @timer: the timer to be initialized
+ * @func: timer callback function
  * @flags: timer flags
  * @name: name of the timer
  * @key: lockdep class key of the fake lock used for tracking timer
@@ -774,11 +782,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
  * init_timer_key() must be done to a timer prior calling *any* of the
  * other timer functions.
  */
-void init_timer_key(struct timer_list *timer, unsigned int flags,
+void init_timer_key(struct timer_list *timer,
+		    void (*func)(struct timer_list *), unsigned int flags,
 		    const char *name, struct lock_class_key *key)
 {
 	debug_init(timer);
-	do_init_timer(timer, flags, name, key);
+	do_init_timer(timer, func, flags, name, key);
 }
 EXPORT_SYMBOL(init_timer_key);
 
-- 
cgit v1.3-14-g43fede


From 841b86f3289dbe858daeceec36423d4ea286fac2 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Mon, 23 Oct 2017 09:40:42 +0200
Subject: treewide: Remove TIMER_FUNC_TYPE and TIMER_DATA_TYPE casts

With all callbacks converted, and the timer callback prototype
switched over, the TIMER_FUNC_TYPE cast is no longer needed,
so remove it. Conversion was done with the following scripts:

    perl -pi -e 's|\(TIMER_FUNC_TYPE\)||g' \
        $(git grep TIMER_FUNC_TYPE | cut -d: -f1 | sort -u)

    perl -pi -e 's|\(TIMER_DATA_TYPE\)||g' \
        $(git grep TIMER_DATA_TYPE | cut -d: -f1 | sort -u)

The now unused macros are also dropped from include/linux/timer.h.

Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/base/power/wakeup.c                       |  2 +-
 drivers/block/aoe/aoecmd.c                        |  2 +-
 drivers/block/swim3.c                             |  2 +-
 drivers/infiniband/hw/nes/nes_verbs.c             |  2 +-
 drivers/input/input.c                             |  2 +-
 drivers/media/common/saa7146/saa7146_vbi.c        |  2 +-
 drivers/net/ethernet/ti/tlan.c                    |  6 +++---
 drivers/net/hamradio/scc.c                        |  8 ++++----
 drivers/net/wireless/quantenna/qtnfmac/cfg80211.c |  2 +-
 drivers/net/wireless/ray_cs.c                     | 12 ++++++------
 drivers/s390/char/sclp.c                          |  4 ++--
 drivers/s390/scsi/zfcp_fsf.c                      |  4 ++--
 drivers/scsi/aic94xx/aic94xx_hwi.c                |  2 +-
 drivers/scsi/aic94xx/aic94xx_tmf.c                |  2 +-
 drivers/scsi/be2iscsi/be_main.c                   |  4 ++--
 drivers/scsi/cxgbi/cxgb3i/cxgb3i.c                |  4 ++--
 drivers/scsi/cxgbi/cxgb4i/cxgb4i.c                |  4 ++--
 drivers/scsi/hisi_sas/hisi_sas_main.c             |  4 ++--
 drivers/scsi/hisi_sas/hisi_sas_v2_hw.c            |  6 +++---
 drivers/scsi/ipr.c                                |  8 ++++----
 drivers/scsi/libfc/fc_fcp.c                       |  6 +++---
 drivers/scsi/libsas/sas_expander.c                |  2 +-
 drivers/scsi/libsas/sas_scsi_host.c               |  2 +-
 drivers/scsi/mvsas/mv_sas.c                       |  4 ++--
 drivers/scsi/pm8001/pm8001_sas.c                  |  4 ++--
 drivers/scsi/pmcraid.c                            | 10 +++++-----
 drivers/staging/irda/include/net/irda/timer.h     |  2 +-
 drivers/tty/serial/8250/8250_core.c               |  4 ++--
 include/linux/kthread.h                           |  2 +-
 include/linux/timer.h                             |  5 +----
 include/linux/workqueue.h                         |  2 +-
 kernel/kthread.c                                  |  2 +-
 kernel/workqueue.c                                |  2 +-
 net/atm/lec.c                                     |  6 +++---
 net/can/proc.c                                    |  4 ++--
 net/lapb/lapb_timer.c                             |  4 ++--
 net/netrom/af_netrom.c                            |  2 +-
 net/netrom/nr_timer.c                             |  2 +-
 net/rose/rose_link.c                              |  4 ++--
 net/rose/rose_timer.c                             | 12 ++++++------
 net/sunrpc/svc_xprt.c                             |  2 +-
 net/x25/af_x25.c                                  |  2 +-
 net/x25/x25_timer.c                               |  2 +-
 sound/usb/line6/driver.c                          |  2 +-
 44 files changed, 84 insertions(+), 87 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 680ee1d36ac9..38559f04db2c 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -481,7 +481,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws)
 	 * Use timer struct to check if the given source is initialized
 	 * by wakeup_source_add.
 	 */
-	return ws->timer.function != (TIMER_FUNC_TYPE)pm_wakeup_timer_fn;
+	return ws->timer.function != pm_wakeup_timer_fn;
 }
 
 /*
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 55ab25f79a08..812fed069708 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1429,7 +1429,7 @@ aoecmd_ata_id(struct aoedev *d)
 
 	d->rttavg = RTTAVG_INIT;
 	d->rttdev = RTTDEV_INIT;
-	d->timer.function = (TIMER_FUNC_TYPE)rexmit_timer;
+	d->timer.function = rexmit_timer;
 
 	skb = skb_clone(skb, GFP_ATOMIC);
 	if (skb) {
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index e620e423102b..af51015d056e 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -397,7 +397,7 @@ static void set_timeout(struct floppy_state *fs, int nticks,
 	if (fs->timeout_pending)
 		del_timer(&fs->timeout);
 	fs->timeout.expires = jiffies + nticks;
-	fs->timeout.function = (TIMER_FUNC_TYPE)proc;
+	fs->timeout.function = proc;
 	add_timer(&fs->timeout);
 	fs->timeout_pending = 1;
 }
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
index db46b7b53fb4..162475aeeedd 100644
--- a/drivers/infiniband/hw/nes/nes_verbs.c
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -3819,7 +3819,7 @@ void  nes_port_ibevent(struct nes_vnic *nesvnic)
 	if (!nesvnic->event_timer.function) {
 		ib_dispatch_event(&event);
 		nesvnic->last_dispatched_event = event.event;
-		nesvnic->event_timer.function = (TIMER_FUNC_TYPE)nes_handle_delayed_event;
+		nesvnic->event_timer.function = nes_handle_delayed_event;
 		nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY;
 		add_timer(&nesvnic->event_timer);
 	} else {
diff --git a/drivers/input/input.c b/drivers/input/input.c
index 44916ef4a424..e30642db50d5 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -2047,7 +2047,7 @@ static void devm_input_device_unregister(struct device *dev, void *res)
  */
 void input_enable_softrepeat(struct input_dev *dev, int delay, int period)
 {
-	dev->timer.function = (TIMER_FUNC_TYPE)input_repeat_key;
+	dev->timer.function = input_repeat_key;
 	dev->rep[REP_DELAY] = delay;
 	dev->rep[REP_PERIOD] = period;
 }
diff --git a/drivers/media/common/saa7146/saa7146_vbi.c b/drivers/media/common/saa7146/saa7146_vbi.c
index ce8d78c137f0..e1d369b976ed 100644
--- a/drivers/media/common/saa7146/saa7146_vbi.c
+++ b/drivers/media/common/saa7146/saa7146_vbi.c
@@ -402,7 +402,7 @@ static int vbi_open(struct saa7146_dev *dev, struct file *file)
 			    sizeof(struct saa7146_buf),
 			    file, &dev->v4l2_lock);
 
-	vv->vbi_read_timeout.function = (TIMER_FUNC_TYPE)vbi_read_timeout;
+	vv->vbi_read_timeout.function = vbi_read_timeout;
 	vv->vbi_read_timeout_file = file;
 
 	/* initialize the brs */
diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c
index 8f53d762fbc4..5a4e78fde530 100644
--- a/drivers/net/ethernet/ti/tlan.c
+++ b/drivers/net/ethernet/ti/tlan.c
@@ -254,7 +254,7 @@ tlan_set_timer(struct net_device *dev, u32 ticks, u32 type)
 			spin_unlock_irqrestore(&priv->lock, flags);
 		return;
 	}
-	priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer;
+	priv->timer.function = tlan_timer;
 	if (!in_irq())
 		spin_unlock_irqrestore(&priv->lock, flags);
 
@@ -1425,7 +1425,7 @@ static u32 tlan_handle_tx_eof(struct net_device *dev, u16 host_int)
 		tlan_dio_write8(dev->base_addr,
 				TLAN_LED_REG, TLAN_LED_LINK | TLAN_LED_ACT);
 		if (priv->timer.function == NULL) {
-			priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer;
+			priv->timer.function = tlan_timer;
 			priv->timer.expires = jiffies + TLAN_TIMER_ACT_DELAY;
 			priv->timer_set_at = jiffies;
 			priv->timer_type = TLAN_TIMER_ACTIVITY;
@@ -1576,7 +1576,7 @@ drop_and_reuse:
 		tlan_dio_write8(dev->base_addr,
 				TLAN_LED_REG, TLAN_LED_LINK | TLAN_LED_ACT);
 		if (priv->timer.function == NULL)  {
-			priv->timer.function = (TIMER_FUNC_TYPE)tlan_timer;
+			priv->timer.function = tlan_timer;
 			priv->timer.expires = jiffies + TLAN_TIMER_ACT_DELAY;
 			priv->timer_set_at = jiffies;
 			priv->timer_type = TLAN_TIMER_ACTIVITY;
diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c
index c9f7215c5dc2..3de272959090 100644
--- a/drivers/net/hamradio/scc.c
+++ b/drivers/net/hamradio/scc.c
@@ -1005,7 +1005,7 @@ static void __scc_start_tx_timer(struct scc_channel *scc,
 	} else 
 	if (when != TIMER_OFF)
 	{
-		scc->tx_t.function = (TIMER_FUNC_TYPE)handler;
+		scc->tx_t.function = handler;
 		scc->tx_t.expires = jiffies + (when*HZ)/100;
 		add_timer(&scc->tx_t);
 	}
@@ -1031,7 +1031,7 @@ static void scc_start_defer(struct scc_channel *scc)
 	
 	if (scc->kiss.maxdefer != 0 && scc->kiss.maxdefer != TIMER_OFF)
 	{
-		scc->tx_wdog.function = (TIMER_FUNC_TYPE)t_busy;
+		scc->tx_wdog.function = t_busy;
 		scc->tx_wdog.expires = jiffies + HZ*scc->kiss.maxdefer;
 		add_timer(&scc->tx_wdog);
 	}
@@ -1047,7 +1047,7 @@ static void scc_start_maxkeyup(struct scc_channel *scc)
 	
 	if (scc->kiss.maxkeyup != 0 && scc->kiss.maxkeyup != TIMER_OFF)
 	{
-		scc->tx_wdog.function = (TIMER_FUNC_TYPE)t_maxkeyup;
+		scc->tx_wdog.function = t_maxkeyup;
 		scc->tx_wdog.expires = jiffies + HZ*scc->kiss.maxkeyup;
 		add_timer(&scc->tx_wdog);
 	}
@@ -1428,7 +1428,7 @@ scc_start_calibrate(struct scc_channel *scc, int duration, unsigned char pattern
 
 	del_timer(&scc->tx_wdog);
 
-	scc->tx_wdog.function = (TIMER_FUNC_TYPE)scc_stop_calibrate;
+	scc->tx_wdog.function = scc_stop_calibrate;
 	scc->tx_wdog.expires = jiffies + HZ*duration;
 	add_timer(&scc->tx_wdog);
 
diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
index 7d6dc76c930a..6711e7fb6926 100644
--- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
+++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c
@@ -554,7 +554,7 @@ qtnf_scan(struct wiphy *wiphy, struct cfg80211_scan_request *request)
 		return -EFAULT;
 	}
 
-	mac->scan_timeout.function = (TIMER_FUNC_TYPE)qtnf_scan_timeout;
+	mac->scan_timeout.function = qtnf_scan_timeout;
 	mod_timer(&mac->scan_timeout,
 		  jiffies + QTNF_SCAN_TIMEOUT_SEC * HZ);
 
diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c
index d8afcdfca1ed..0133fcd4601b 100644
--- a/drivers/net/wireless/ray_cs.c
+++ b/drivers/net/wireless/ray_cs.c
@@ -569,7 +569,7 @@ static int dl_startup_params(struct net_device *dev)
 	local->card_status = CARD_DL_PARAM;
 	/* Start kernel timer to wait for dl startup to complete. */
 	local->timer.expires = jiffies + HZ / 2;
-	local->timer.function = (TIMER_FUNC_TYPE)verify_dl_startup;
+	local->timer.function = verify_dl_startup;
 	add_timer(&local->timer);
 	dev_dbg(&link->dev,
 	      "ray_cs dl_startup_params started timer for verify_dl_startup\n");
@@ -1947,12 +1947,12 @@ static irqreturn_t ray_interrupt(int irq, void *dev_id)
 					dev_dbg(&link->dev,
 					      "ray_cs interrupt network \"%s\" start failed\n",
 					      memtmp);
-					local->timer.function = (TIMER_FUNC_TYPE)start_net;
+					local->timer.function = start_net;
 				} else {
 					dev_dbg(&link->dev,
 					      "ray_cs interrupt network \"%s\" join failed\n",
 					      memtmp);
-					local->timer.function = (TIMER_FUNC_TYPE)join_net;
+					local->timer.function = join_net;
 				}
 				add_timer(&local->timer);
 			}
@@ -2417,9 +2417,9 @@ static void authenticate(ray_dev_t *local)
 
 	del_timer(&local->timer);
 	if (build_auth_frame(local, local->bss_id, OPEN_AUTH_REQUEST)) {
-		local->timer.function = (TIMER_FUNC_TYPE)join_net;
+		local->timer.function = join_net;
 	} else {
-		local->timer.function = (TIMER_FUNC_TYPE)authenticate_timeout;
+		local->timer.function = authenticate_timeout;
 	}
 	local->timer.expires = jiffies + HZ * 2;
 	add_timer(&local->timer);
@@ -2502,7 +2502,7 @@ static void associate(ray_dev_t *local)
 
 		del_timer(&local->timer);
 		local->timer.expires = jiffies + HZ * 2;
-		local->timer.function = (TIMER_FUNC_TYPE)join_net;
+		local->timer.function = join_net;
 		add_timer(&local->timer);
 		local->card_status = CARD_ASSOC_FAILED;
 		return;
diff --git a/drivers/s390/char/sclp.c b/drivers/s390/char/sclp.c
index 9b4c61c1e309..e4e2df7a478e 100644
--- a/drivers/s390/char/sclp.c
+++ b/drivers/s390/char/sclp.c
@@ -158,7 +158,7 @@ static inline void
 __sclp_set_request_timer(unsigned long time, void (*cb)(struct timer_list *))
 {
 	del_timer(&sclp_request_timer);
-	sclp_request_timer.function = (TIMER_FUNC_TYPE)cb;
+	sclp_request_timer.function = cb;
 	sclp_request_timer.expires = jiffies + time;
 	add_timer(&sclp_request_timer);
 }
@@ -566,7 +566,7 @@ sclp_sync_wait(void)
 		if (timer_pending(&sclp_request_timer) &&
 		    get_tod_clock_fast() > timeout &&
 		    del_timer(&sclp_request_timer))
-			sclp_request_timer.function((TIMER_DATA_TYPE)&sclp_request_timer);
+			sclp_request_timer.function(&sclp_request_timer);
 		cpu_relax();
 	}
 	local_irq_disable();
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index 51b81c0a0652..b12cb81ad8a2 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -34,7 +34,7 @@ static void zfcp_fsf_request_timeout_handler(struct timer_list *t)
 static void zfcp_fsf_start_timer(struct zfcp_fsf_req *fsf_req,
 				 unsigned long timeout)
 {
-	fsf_req->timer.function = (TIMER_FUNC_TYPE)zfcp_fsf_request_timeout_handler;
+	fsf_req->timer.function = zfcp_fsf_request_timeout_handler;
 	fsf_req->timer.expires = jiffies + timeout;
 	add_timer(&fsf_req->timer);
 }
@@ -42,7 +42,7 @@ static void zfcp_fsf_start_timer(struct zfcp_fsf_req *fsf_req,
 static void zfcp_fsf_start_erp_timer(struct zfcp_fsf_req *fsf_req)
 {
 	BUG_ON(!fsf_req->erp_action);
-	fsf_req->timer.function = (TIMER_FUNC_TYPE)zfcp_erp_timeout_handler;
+	fsf_req->timer.function = zfcp_erp_timeout_handler;
 	fsf_req->timer.expires = jiffies + 30 * HZ;
 	add_timer(&fsf_req->timer);
 }
diff --git a/drivers/scsi/aic94xx/aic94xx_hwi.c b/drivers/scsi/aic94xx/aic94xx_hwi.c
index 5402b85b0bdc..2dbc8330d7d3 100644
--- a/drivers/scsi/aic94xx/aic94xx_hwi.c
+++ b/drivers/scsi/aic94xx/aic94xx_hwi.c
@@ -1175,7 +1175,7 @@ static void asd_start_scb_timers(struct list_head *list)
 	struct asd_ascb *ascb;
 	list_for_each_entry(ascb, list, list) {
 		if (!ascb->uldd_timer) {
-			ascb->timer.function = (TIMER_FUNC_TYPE)asd_ascb_timedout;
+			ascb->timer.function = asd_ascb_timedout;
 			ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT;
 			add_timer(&ascb->timer);
 		}
diff --git a/drivers/scsi/aic94xx/aic94xx_tmf.c b/drivers/scsi/aic94xx/aic94xx_tmf.c
index 4637119c09d8..2a01702d5ba7 100644
--- a/drivers/scsi/aic94xx/aic94xx_tmf.c
+++ b/drivers/scsi/aic94xx/aic94xx_tmf.c
@@ -42,7 +42,7 @@ static int asd_enqueue_internal(struct asd_ascb *ascb,
 	ascb->tasklet_complete = tasklet_complete;
 	ascb->uldd_timer = 1;
 
-	ascb->timer.function = (TIMER_FUNC_TYPE)timed_out;
+	ascb->timer.function = timed_out;
 	ascb->timer.expires = jiffies + AIC94XX_SCB_TIMEOUT;
 
 	add_timer(&ascb->timer);
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index be96aa1e5077..b3cfdd5f4d1c 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -5279,7 +5279,7 @@ static void beiscsi_hw_health_check(struct timer_list *t)
 		if (!test_bit(BEISCSI_HBA_UER_SUPP, &phba->state))
 			return;
 		/* modify this timer to check TPE */
-		phba->hw_check.function = (TIMER_FUNC_TYPE)beiscsi_hw_tpe_check;
+		phba->hw_check.function = beiscsi_hw_tpe_check;
 	}
 
 	mod_timer(&phba->hw_check,
@@ -5367,7 +5367,7 @@ static int beiscsi_enable_port(struct beiscsi_hba *phba)
 	 * Timer function gets modified for TPE detection.
 	 * Always reinit to do health check first.
 	 */
-	phba->hw_check.function = (TIMER_FUNC_TYPE)beiscsi_hw_health_check;
+	phba->hw_check.function = beiscsi_hw_health_check;
 	mod_timer(&phba->hw_check,
 		  jiffies + msecs_to_jiffies(BEISCSI_UE_DETECT_INTERVAL));
 	return 0;
diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
index babd79361a46..bf07735275a4 100644
--- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
+++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c
@@ -586,8 +586,8 @@ static int do_act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
 	cxgbi_sock_get(csk);
 	spin_lock_bh(&csk->lock);
 	if (rpl->status == CPL_ERR_CONN_EXIST &&
-	    csk->retry_timer.function != (TIMER_FUNC_TYPE)act_open_retry_timer) {
-		csk->retry_timer.function = (TIMER_FUNC_TYPE)act_open_retry_timer;
+	    csk->retry_timer.function != act_open_retry_timer) {
+		csk->retry_timer.function = act_open_retry_timer;
 		mod_timer(&csk->retry_timer, jiffies + HZ / 2);
 	} else
 		cxgbi_sock_fail_act_open(csk,
diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
index 266eddf17a99..406e94312d4e 100644
--- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
+++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
@@ -963,8 +963,8 @@ static void do_act_open_rpl(struct cxgbi_device *cdev, struct sk_buff *skb)
 	spin_lock_bh(&csk->lock);
 
 	if (status == CPL_ERR_CONN_EXIST &&
-	    csk->retry_timer.function != (TIMER_FUNC_TYPE)csk_act_open_retry_timer) {
-		csk->retry_timer.function = (TIMER_FUNC_TYPE)csk_act_open_retry_timer;
+	    csk->retry_timer.function != csk_act_open_retry_timer) {
+		csk->retry_timer.function = csk_act_open_retry_timer;
 		mod_timer(&csk->retry_timer, jiffies + HZ / 2);
 	} else
 		cxgbi_sock_fail_act_open(csk,
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c
index 61a85ff8e459..5f503cb09508 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_main.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_main.c
@@ -839,7 +839,7 @@ static int hisi_sas_exec_internal_tmf_task(struct domain_device *device,
 		}
 		task->task_done = hisi_sas_task_done;
 
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)hisi_sas_tmf_timedout;
+		task->slow_task->timer.function = hisi_sas_tmf_timedout;
 		task->slow_task->timer.expires = jiffies + TASK_TIMEOUT*HZ;
 		add_timer(&task->slow_task->timer);
 
@@ -1451,7 +1451,7 @@ hisi_sas_internal_task_abort(struct hisi_hba *hisi_hba,
 	task->dev = device;
 	task->task_proto = device->tproto;
 	task->task_done = hisi_sas_task_done;
-	task->slow_task->timer.function = (TIMER_FUNC_TYPE)hisi_sas_tmf_timedout;
+	task->slow_task->timer.function = hisi_sas_tmf_timedout;
 	task->slow_task->timer.expires = jiffies + msecs_to_jiffies(110);
 	add_timer(&task->slow_task->timer);
 
diff --git a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
index d02c2a791981..5d3467fd728d 100644
--- a/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
+++ b/drivers/scsi/hisi_sas/hisi_sas_v2_hw.c
@@ -1268,7 +1268,7 @@ static void link_timeout_enable_link(struct timer_list *t)
 		}
 	}
 
-	hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_disable_link;
+	hisi_hba->timer.function = link_timeout_disable_link;
 	mod_timer(&hisi_hba->timer, jiffies + msecs_to_jiffies(900));
 }
 
@@ -1289,13 +1289,13 @@ static void link_timeout_disable_link(struct timer_list *t)
 		}
 	}
 
-	hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_enable_link;
+	hisi_hba->timer.function = link_timeout_enable_link;
 	mod_timer(&hisi_hba->timer, jiffies + msecs_to_jiffies(100));
 }
 
 static void set_link_timer_quirk(struct hisi_hba *hisi_hba)
 {
-	hisi_hba->timer.function = (TIMER_FUNC_TYPE)link_timeout_disable_link;
+	hisi_hba->timer.function = link_timeout_disable_link;
 	hisi_hba->timer.expires = jiffies + msecs_to_jiffies(1000);
 	add_timer(&hisi_hba->timer);
 }
diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index d53429371127..cc0187965eee 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -997,7 +997,7 @@ static void ipr_do_req(struct ipr_cmnd *ipr_cmd,
 	ipr_cmd->done = done;
 
 	ipr_cmd->timer.expires = jiffies + timeout;
-	ipr_cmd->timer.function = (TIMER_FUNC_TYPE)timeout_func;
+	ipr_cmd->timer.function = timeout_func;
 
 	add_timer(&ipr_cmd->timer);
 
@@ -8312,7 +8312,7 @@ static void ipr_reset_start_timer(struct ipr_cmnd *ipr_cmd,
 	ipr_cmd->done = ipr_reset_ioa_job;
 
 	ipr_cmd->timer.expires = jiffies + timeout;
-	ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_reset_timer_done;
+	ipr_cmd->timer.function = ipr_reset_timer_done;
 	add_timer(&ipr_cmd->timer);
 }
 
@@ -8397,7 +8397,7 @@ static int ipr_reset_next_stage(struct ipr_cmnd *ipr_cmd)
 	}
 
 	ipr_cmd->timer.expires = jiffies + stage_time * HZ;
-	ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_oper_timeout;
+	ipr_cmd->timer.function = ipr_oper_timeout;
 	ipr_cmd->done = ipr_reset_ioa_job;
 	add_timer(&ipr_cmd->timer);
 
@@ -8468,7 +8468,7 @@ static int ipr_reset_enable_ioa(struct ipr_cmnd *ipr_cmd)
 	}
 
 	ipr_cmd->timer.expires = jiffies + (ioa_cfg->transop_timeout * HZ);
-	ipr_cmd->timer.function = (TIMER_FUNC_TYPE)ipr_oper_timeout;
+	ipr_cmd->timer.function = ipr_oper_timeout;
 	ipr_cmd->done = ipr_reset_ioa_job;
 	add_timer(&ipr_cmd->timer);
 	list_add_tail(&ipr_cmd->queue, &ipr_cmd->hrrq->hrrq_pending_q);
diff --git a/drivers/scsi/libfc/fc_fcp.c b/drivers/scsi/libfc/fc_fcp.c
index 1a4e701a8449..4fae253d4f3d 100644
--- a/drivers/scsi/libfc/fc_fcp.c
+++ b/drivers/scsi/libfc/fc_fcp.c
@@ -1214,7 +1214,7 @@ static int fc_fcp_cmd_send(struct fc_lport *lport, struct fc_fcp_pkt *fsp,
 	fsp->seq_ptr = seq;
 	fc_fcp_pkt_hold(fsp);	/* hold for fc_fcp_pkt_destroy */
 
-	fsp->timer.function = (TIMER_FUNC_TYPE)fc_fcp_timeout;
+	fsp->timer.function = fc_fcp_timeout;
 	if (rpriv->flags & FC_RP_FLAGS_REC_SUPPORTED)
 		fc_fcp_timer_set(fsp, get_fsp_rec_tov(fsp));
 
@@ -1307,7 +1307,7 @@ static void fc_lun_reset_send(struct timer_list *t)
 			return;
 		if (fc_fcp_lock_pkt(fsp))
 			return;
-		fsp->timer.function = (TIMER_FUNC_TYPE)fc_lun_reset_send;
+		fsp->timer.function = fc_lun_reset_send;
 		fc_fcp_timer_set(fsp, get_fsp_rec_tov(fsp));
 		fc_fcp_unlock_pkt(fsp);
 	}
@@ -1445,7 +1445,7 @@ static void fc_fcp_timeout(struct timer_list *t)
 	if (fsp->lp->qfull) {
 		FC_FCP_DBG(fsp, "fcp timeout, resetting timer delay %d\n",
 			   fsp->timer_delay);
-		fsp->timer.function = (TIMER_FUNC_TYPE)fc_fcp_timeout;
+		fsp->timer.function = fc_fcp_timeout;
 		fc_fcp_timer_set(fsp, fsp->timer_delay);
 		goto unlock;
 	}
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 174e5eff6155..ca1566237ae7 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -92,7 +92,7 @@ static int smp_execute_task_sg(struct domain_device *dev,
 
 		task->task_done = smp_task_done;
 
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)smp_task_timedout;
+		task->slow_task->timer.function = smp_task_timedout;
 		task->slow_task->timer.expires = jiffies + SMP_TIMEOUT*HZ;
 		add_timer(&task->slow_task->timer);
 
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 91795eb56206..58476b728c57 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -919,7 +919,7 @@ void sas_task_abort(struct sas_task *task)
 			return;
 		if (!del_timer(&slow->timer))
 			return;
-		slow->timer.function((TIMER_DATA_TYPE)&slow->timer);
+		slow->timer.function(&slow->timer);
 		return;
 	}
 
diff --git a/drivers/scsi/mvsas/mv_sas.c b/drivers/scsi/mvsas/mv_sas.c
index cff1c37b8d2e..cff43bd9f675 100644
--- a/drivers/scsi/mvsas/mv_sas.c
+++ b/drivers/scsi/mvsas/mv_sas.c
@@ -1310,7 +1310,7 @@ static int mvs_exec_internal_tmf_task(struct domain_device *dev,
 		memcpy(&task->ssp_task, parameter, para_len);
 		task->task_done = mvs_task_done;
 
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)mvs_tmf_timedout;
+		task->slow_task->timer.function = mvs_tmf_timedout;
 		task->slow_task->timer.expires = jiffies + MVS_TASK_TIMEOUT*HZ;
 		add_timer(&task->slow_task->timer);
 
@@ -2020,7 +2020,7 @@ void mvs_int_port(struct mvs_info *mvi, int phy_no, u32 events)
 		MVS_CHIP_DISP->write_port_irq_mask(mvi, phy_no,
 					tmp | PHYEV_SIG_FIS);
 		if (phy->timer.function == NULL) {
-			phy->timer.function = (TIMER_FUNC_TYPE)mvs_sig_time_out;
+			phy->timer.function = mvs_sig_time_out;
 			phy->timer.expires = jiffies + 5*HZ;
 			add_timer(&phy->timer);
 		}
diff --git a/drivers/scsi/pm8001/pm8001_sas.c b/drivers/scsi/pm8001/pm8001_sas.c
index 0e294e80c169..947d6017d004 100644
--- a/drivers/scsi/pm8001/pm8001_sas.c
+++ b/drivers/scsi/pm8001/pm8001_sas.c
@@ -695,7 +695,7 @@ static int pm8001_exec_internal_tmf_task(struct domain_device *dev,
 		task->task_proto = dev->tproto;
 		memcpy(&task->ssp_task, parameter, para_len);
 		task->task_done = pm8001_task_done;
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)pm8001_tmf_timedout;
+		task->slow_task->timer.function = pm8001_tmf_timedout;
 		task->slow_task->timer.expires = jiffies + PM8001_TASK_TIMEOUT*HZ;
 		add_timer(&task->slow_task->timer);
 
@@ -781,7 +781,7 @@ pm8001_exec_internal_task_abort(struct pm8001_hba_info *pm8001_ha,
 		task->dev = dev;
 		task->task_proto = dev->tproto;
 		task->task_done = pm8001_task_done;
-		task->slow_task->timer.function = (TIMER_FUNC_TYPE)pm8001_tmf_timedout;
+		task->slow_task->timer.function = pm8001_tmf_timedout;
 		task->slow_task->timer.expires = jiffies + PM8001_TASK_TIMEOUT * HZ;
 		add_timer(&task->slow_task->timer);
 
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index 4f9f115fb6a0..e58be98430b0 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -604,7 +604,7 @@ static void pmcraid_start_bist(struct pmcraid_cmd *cmd)
 
 	cmd->time_left = msecs_to_jiffies(PMCRAID_BIST_TIMEOUT);
 	cmd->timer.expires = jiffies + msecs_to_jiffies(PMCRAID_BIST_TIMEOUT);
-	cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_bist_done;
+	cmd->timer.function = pmcraid_bist_done;
 	add_timer(&cmd->timer);
 }
 
@@ -636,7 +636,7 @@ static void pmcraid_reset_alert_done(struct timer_list *t)
 		/* restart timer if some more time is available to wait */
 		cmd->time_left -= PMCRAID_CHECK_FOR_RESET_TIMEOUT;
 		cmd->timer.expires = jiffies + PMCRAID_CHECK_FOR_RESET_TIMEOUT;
-		cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_reset_alert_done;
+		cmd->timer.function = pmcraid_reset_alert_done;
 		add_timer(&cmd->timer);
 	}
 }
@@ -673,7 +673,7 @@ static void pmcraid_reset_alert(struct pmcraid_cmd *cmd)
 		 */
 		cmd->time_left = PMCRAID_RESET_TIMEOUT;
 		cmd->timer.expires = jiffies + PMCRAID_CHECK_FOR_RESET_TIMEOUT;
-		cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_reset_alert_done;
+		cmd->timer.function = pmcraid_reset_alert_done;
 		add_timer(&cmd->timer);
 
 		iowrite32(DOORBELL_IOA_RESET_ALERT,
@@ -923,7 +923,7 @@ static void pmcraid_send_cmd(
 	if (timeout_func) {
 		/* setup timeout handler */
 		cmd->timer.expires = jiffies + timeout;
-		cmd->timer.function = (TIMER_FUNC_TYPE)timeout_func;
+		cmd->timer.function = timeout_func;
 		add_timer(&cmd->timer);
 	}
 
@@ -1951,7 +1951,7 @@ static void pmcraid_soft_reset(struct pmcraid_cmd *cmd)
 	cmd->cmd_done = pmcraid_ioa_reset;
 	cmd->timer.expires = jiffies +
 			     msecs_to_jiffies(PMCRAID_TRANSOP_TIMEOUT);
-	cmd->timer.function = (TIMER_FUNC_TYPE)pmcraid_timeout_handler;
+	cmd->timer.function = pmcraid_timeout_handler;
 
 	if (!timer_pending(&cmd->timer))
 		add_timer(&cmd->timer);
diff --git a/drivers/staging/irda/include/net/irda/timer.h b/drivers/staging/irda/include/net/irda/timer.h
index a6635f0afae9..6dab15f5dae1 100644
--- a/drivers/staging/irda/include/net/irda/timer.h
+++ b/drivers/staging/irda/include/net/irda/timer.h
@@ -75,7 +75,7 @@ struct lap_cb;
 static inline void irda_start_timer(struct timer_list *ptimer, int timeout,
 				    void (*callback)(struct timer_list *))
 {
-	ptimer->function = (TIMER_FUNC_TYPE) callback;
+	ptimer->function =  callback;
 
 	/* Set new value for timer (update or add timer).
 	 * We use mod_timer() because it's more efficient and also
diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index d64afdd93872..9342fc2ee7df 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -325,7 +325,7 @@ static int univ8250_setup_irq(struct uart_8250_port *up)
 	if (up->bugs & UART_BUG_THRE) {
 		pr_debug("ttyS%d - using backup timer\n", serial_index(port));
 
-		up->timer.function = (TIMER_FUNC_TYPE)serial8250_backup_timeout;
+		up->timer.function = serial8250_backup_timeout;
 		mod_timer(&up->timer, jiffies +
 			  uart_poll_timeout(port) + HZ / 5);
 	}
@@ -348,7 +348,7 @@ static void univ8250_release_irq(struct uart_8250_port *up)
 	struct uart_port *port = &up->port;
 
 	del_timer_sync(&up->timer);
-	up->timer.function = (TIMER_FUNC_TYPE)serial8250_timeout;
+	up->timer.function = serial8250_timeout;
 	if (port->irq)
 		serial_unlink_irq_chain(up);
 }
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index dc850d257ea2..c1961761311d 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -118,7 +118,7 @@ struct kthread_delayed_work {
 
 #define KTHREAD_DELAYED_WORK_INIT(dwork, fn) {				\
 	.work = KTHREAD_WORK_INIT((dwork).work, (fn)),			\
-	.timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn,\
+	.timer = __TIMER_INITIALIZER(kthread_delayed_work_timer_fn,\
 				     TIMER_IRQSAFE),			\
 	}
 
diff --git a/include/linux/timer.h b/include/linux/timer.h
index b1ae64b112c2..04af640ea95b 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -63,9 +63,6 @@ struct timer_list {
 
 #define TIMER_TRACE_FLAGMASK	(TIMER_MIGRATING | TIMER_DEFERRABLE | TIMER_PINNED | TIMER_IRQSAFE)
 
-#define TIMER_DATA_TYPE		struct timer_list *
-#define TIMER_FUNC_TYPE		void (*)(TIMER_DATA_TYPE)
-
 #define __TIMER_INITIALIZER(_function, _flags) {		\
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
@@ -76,7 +73,7 @@ struct timer_list {
 
 #define DEFINE_TIMER(_name, _function)				\
 	struct timer_list _name =				\
-		__TIMER_INITIALIZER((TIMER_FUNC_TYPE)_function, 0)
+		__TIMER_INITIALIZER(_function, 0)
 
 /*
  * LOCKDEP and DEBUG timer interfaces.
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bff39faba793..4a54ef96aff5 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -176,7 +176,7 @@ struct execute_work {
 
 #define __DELAYED_WORK_INITIALIZER(n, f, tflags) {			\
 	.work = __WORK_INITIALIZER((n).work, (f)),			\
-	.timer = __TIMER_INITIALIZER((TIMER_FUNC_TYPE)delayed_work_timer_fn,\
+	.timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\
 				     (tflags) | TIMER_IRQSAFE),		\
 	}
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8af313081b0d..cd50e99202b0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -843,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
 	struct timer_list *timer = &dwork->timer;
 	struct kthread_work *work = &dwork->work;
 
-	WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
+	WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
 
 	/*
 	 * If @delay is 0, queue @dwork->work immediately.  This is for
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dde6298f6b22..8fdb710bfdd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	struct work_struct *work = &dwork->work;
 
 	WARN_ON_ONCE(!wq);
-	WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
+	WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
 	WARN_ON_ONCE(timer_pending(timer));
 	WARN_ON_ONCE(!list_empty(&work->entry));
 
diff --git a/net/atm/lec.c b/net/atm/lec.c
index c976196da3ea..6676e3433261 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1798,7 +1798,7 @@ static struct atm_vcc *lec_arp_resolve(struct lec_priv *priv,
 		else
 			send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL);
 		entry->timer.expires = jiffies + (1 * HZ);
-		entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_arp;
+		entry->timer.function = lec_arp_expire_arp;
 		add_timer(&entry->timer);
 		found = priv->mcast_vcc;
 	}
@@ -1998,7 +1998,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
 		entry->old_recv_push = old_push;
 		entry->status = ESI_UNKNOWN;
 		entry->timer.expires = jiffies + priv->vcc_timeout_period;
-		entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc;
+		entry->timer.function = lec_arp_expire_vcc;
 		hlist_add_head(&entry->next, &priv->lec_no_forward);
 		add_timer(&entry->timer);
 		dump_arp_table(priv);
@@ -2082,7 +2082,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
 	entry->status = ESI_UNKNOWN;
 	hlist_add_head(&entry->next, &priv->lec_arp_empty_ones);
 	entry->timer.expires = jiffies + priv->vcc_timeout_period;
-	entry->timer.function = (TIMER_FUNC_TYPE)lec_arp_expire_vcc;
+	entry->timer.function = lec_arp_expire_vcc;
 	add_timer(&entry->timer);
 	pr_debug("After vcc was added\n");
 	dump_arp_table(priv);
diff --git a/net/can/proc.c b/net/can/proc.c
index d979b3dc49a6..0c59f876fe6f 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -221,7 +221,7 @@ static int can_stats_proc_show(struct seq_file *m, void *v)
 
 	seq_putc(m, '\n');
 
-	if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) {
+	if (net->can.can_stattimer.function == can_stat_update) {
 		seq_printf(m, " %8ld %% total match ratio (RXMR)\n",
 				can_stats->total_rx_match_ratio);
 
@@ -291,7 +291,7 @@ static int can_reset_stats_proc_show(struct seq_file *m, void *v)
 
 	user_reset = 1;
 
-	if (net->can.can_stattimer.function == (TIMER_FUNC_TYPE)can_stat_update) {
+	if (net->can.can_stattimer.function == can_stat_update) {
 		seq_printf(m, "Scheduled statistic reset #%ld.\n",
 				can_pstats->stats_reset + 1);
 	} else {
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 8bb469cb3abe..5d4ae01951b5 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -42,7 +42,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb)
 {
 	del_timer(&lapb->t1timer);
 
-	lapb->t1timer.function = (TIMER_FUNC_TYPE)lapb_t1timer_expiry;
+	lapb->t1timer.function = lapb_t1timer_expiry;
 	lapb->t1timer.expires  = jiffies + lapb->t1;
 
 	add_timer(&lapb->t1timer);
@@ -52,7 +52,7 @@ void lapb_start_t2timer(struct lapb_cb *lapb)
 {
 	del_timer(&lapb->t2timer);
 
-	lapb->t2timer.function = (TIMER_FUNC_TYPE)lapb_t2timer_expiry;
+	lapb->t2timer.function = lapb_t2timer_expiry;
 	lapb->t2timer.expires  = jiffies + lapb->t2;
 
 	add_timer(&lapb->t2timer);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 2dec3583c97d..7ed9d4422a73 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -284,7 +284,7 @@ void nr_destroy_socket(struct sock *sk)
 
 	if (sk_has_allocations(sk)) {
 		/* Defer: outstanding buffers */
-		sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_destroy_timer;
+		sk->sk_timer.function = nr_destroy_timer;
 		sk->sk_timer.expires  = jiffies + 2 * HZ;
 		add_timer(&sk->sk_timer);
 	} else
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index 43569aea0f5e..cbd51ed5a2d7 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -45,7 +45,7 @@ void nr_init_timers(struct sock *sk)
 	timer_setup(&nr->idletimer, nr_idletimer_expiry, 0);
 
 	/* initialized by sock_init_data */
-	sk->sk_timer.function = (TIMER_FUNC_TYPE)nr_heartbeat_expiry;
+	sk->sk_timer.function = nr_heartbeat_expiry;
 }
 
 void nr_start_t1timer(struct sock *sk)
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
index cda4c6678ef1..62055d3069d2 100644
--- a/net/rose/rose_link.c
+++ b/net/rose/rose_link.c
@@ -37,7 +37,7 @@ void rose_start_ftimer(struct rose_neigh *neigh)
 {
 	del_timer(&neigh->ftimer);
 
-	neigh->ftimer.function = (TIMER_FUNC_TYPE)rose_ftimer_expiry;
+	neigh->ftimer.function = rose_ftimer_expiry;
 	neigh->ftimer.expires  =
 		jiffies + msecs_to_jiffies(sysctl_rose_link_fail_timeout);
 
@@ -48,7 +48,7 @@ static void rose_start_t0timer(struct rose_neigh *neigh)
 {
 	del_timer(&neigh->t0timer);
 
-	neigh->t0timer.function = (TIMER_FUNC_TYPE)rose_t0timer_expiry;
+	neigh->t0timer.function = rose_t0timer_expiry;
 	neigh->t0timer.expires  =
 		jiffies + msecs_to_jiffies(sysctl_rose_restart_request_timeout);
 
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index ea613b2a9735..74555fb95615 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -36,7 +36,7 @@ void rose_start_heartbeat(struct sock *sk)
 {
 	del_timer(&sk->sk_timer);
 
-	sk->sk_timer.function = (TIMER_FUNC_TYPE)rose_heartbeat_expiry;
+	sk->sk_timer.function = rose_heartbeat_expiry;
 	sk->sk_timer.expires  = jiffies + 5 * HZ;
 
 	add_timer(&sk->sk_timer);
@@ -48,7 +48,7 @@ void rose_start_t1timer(struct sock *sk)
 
 	del_timer(&rose->timer);
 
-	rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry;
+	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t1;
 
 	add_timer(&rose->timer);
@@ -60,7 +60,7 @@ void rose_start_t2timer(struct sock *sk)
 
 	del_timer(&rose->timer);
 
-	rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry;
+	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t2;
 
 	add_timer(&rose->timer);
@@ -72,7 +72,7 @@ void rose_start_t3timer(struct sock *sk)
 
 	del_timer(&rose->timer);
 
-	rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry;
+	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->t3;
 
 	add_timer(&rose->timer);
@@ -84,7 +84,7 @@ void rose_start_hbtimer(struct sock *sk)
 
 	del_timer(&rose->timer);
 
-	rose->timer.function = (TIMER_FUNC_TYPE)rose_timer_expiry;
+	rose->timer.function = rose_timer_expiry;
 	rose->timer.expires  = jiffies + rose->hb;
 
 	add_timer(&rose->timer);
@@ -97,7 +97,7 @@ void rose_start_idletimer(struct sock *sk)
 	del_timer(&rose->idletimer);
 
 	if (rose->idle > 0) {
-		rose->idletimer.function = (TIMER_FUNC_TYPE)rose_idletimer_expiry;
+		rose->idletimer.function = rose_idletimer_expiry;
 		rose->idletimer.expires  = jiffies + rose->idle;
 
 		add_timer(&rose->idletimer);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e8e0831229cf..f9307bd6644b 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -745,7 +745,7 @@ static void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt
 	serv->sv_tmpcnt++;
 	if (serv->sv_temptimer.function == NULL) {
 		/* setup timer to age temp transports */
-		serv->sv_temptimer.function = (TIMER_FUNC_TYPE)svc_age_temp_xprts;
+		serv->sv_temptimer.function = svc_age_temp_xprts;
 		mod_timer(&serv->sv_temptimer,
 			  jiffies + svc_conn_age_period * HZ);
 	}
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index ea87143314f3..562cc11131f6 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -415,7 +415,7 @@ static void __x25_destroy_socket(struct sock *sk)
 	if (sk_has_allocations(sk)) {
 		/* Defer: outstanding buffers */
 		sk->sk_timer.expires  = jiffies + 10 * HZ;
-		sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_destroy_timer;
+		sk->sk_timer.function = x25_destroy_timer;
 		add_timer(&sk->sk_timer);
 	} else {
 		/* drop last reference so sock_put will free */
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
index 1dfba3c23459..fa3461002b3e 100644
--- a/net/x25/x25_timer.c
+++ b/net/x25/x25_timer.c
@@ -36,7 +36,7 @@ void x25_init_timers(struct sock *sk)
 	timer_setup(&x25->timer, x25_timer_expiry, 0);
 
 	/* initialized by sock_init_data */
-	sk->sk_timer.function = (TIMER_FUNC_TYPE)x25_heartbeat_expiry;
+	sk->sk_timer.function = x25_heartbeat_expiry;
 }
 
 void x25_start_heartbeat(struct sock *sk)
diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c
index 4f9613e5fc9e..c1376bfdc90b 100644
--- a/sound/usb/line6/driver.c
+++ b/sound/usb/line6/driver.c
@@ -201,7 +201,7 @@ static int line6_send_raw_message_async_part(struct message *msg,
 void line6_start_timer(struct timer_list *timer, unsigned long msecs,
 		       void (*function)(struct timer_list *t))
 {
-	timer->function = (TIMER_FUNC_TYPE)function;
+	timer->function = function;
 	mod_timer(timer, jiffies + msecs_to_jiffies(msecs));
 }
 EXPORT_SYMBOL_GPL(line6_start_timer);
-- 
cgit v1.3-14-g43fede


From db1ac4964fa172803a0fea83033cd35d380a8a77 Mon Sep 17 00:00:00 2001
From: Gianluca Borello <g.borello@gmail.com>
Date: Wed, 22 Nov 2017 18:32:53 +0000
Subject: bpf: introduce ARG_PTR_TO_MEM_OR_NULL

With the current ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM semantics, an helper
argument can be NULL when the next argument type is ARG_CONST_SIZE_OR_ZERO
and the verifier can prove the value of this next argument is 0. However,
most helpers are just interested in handling <!NULL, 0>, so forcing them to
deal with <NULL, 0> makes the implementation of those helpers more
complicated for no apparent benefits, requiring them to explicitly handle
those corner cases with checks that bpf programs could start relying upon,
preventing the possibility of removing them later.

Solve this by making ARG_PTR_TO_MEM/ARG_PTR_TO_UNINIT_MEM never accept NULL
even when ARG_CONST_SIZE_OR_ZERO is set, and introduce a new argument type
ARG_PTR_TO_MEM_OR_NULL to explicitly deal with the NULL case.

Currently, the only helper that needs this is bpf_csum_diff_proto(), so
change arg1 and arg3 to this new type as well.

Also add a new battery of tests that explicitly test the
!ARG_PTR_TO_MEM_OR_NULL combination: all the current ones testing the
various <NULL, 0> variations are focused on bpf_csum_diff, so cover also
other helpers.

Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h                         |   1 +
 kernel/bpf/verifier.c                       |   4 +-
 net/core/filter.c                           |   4 +-
 tools/testing/selftests/bpf/test_verifier.c | 113 ++++++++++++++++++++++++++--
 4 files changed, 112 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 76c577281d78..e55e4255a210 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -78,6 +78,7 @@ enum bpf_arg_type {
 	 * functions that access data on eBPF program stack
 	 */
 	ARG_PTR_TO_MEM,		/* pointer to valid memory (stack, packet, map value) */
+	ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */
 	ARG_PTR_TO_UNINIT_MEM,	/* pointer to memory does not need to be initialized,
 				 * helper function must fill all bytes or clear
 				 * them in error case.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index dd54d20ace2f..308b0638ec5d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1384,13 +1384,15 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		if (type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_PTR_TO_MEM ||
+		   arg_type == ARG_PTR_TO_MEM_OR_NULL ||
 		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a SCALAR_VALUE type. Final test
 		 * happens during stack boundary checking.
 		 */
-		if (register_is_null(*reg))
+		if (register_is_null(*reg) &&
+		    arg_type == ARG_PTR_TO_MEM_OR_NULL)
 			/* final test in check_stack_boundary() */;
 		else if (!type_is_pkt_pointer(type) &&
 			 type != PTR_TO_MAP_VALUE &&
diff --git a/net/core/filter.c b/net/core/filter.c
index 1afa17935954..6a85e67fafce 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1646,9 +1646,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
 	.gpl_only	= false,
 	.pkt_access	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg1_type	= ARG_PTR_TO_MEM_OR_NULL,
 	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
-	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_PTR_TO_MEM_OR_NULL,
 	.arg4_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg5_type	= ARG_ANYTHING,
 };
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 2a5267bef160..3c64f30cf63c 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -5631,7 +5631,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
 	},
 	{
-		"helper access to variable memory: size = 0 allowed on NULL",
+		"helper access to variable memory: size = 0 allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_1, 0),
 			BPF_MOV64_IMM(BPF_REG_2, 0),
@@ -5645,7 +5645,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size > 0 not allowed on NULL",
+		"helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_MOV64_IMM(BPF_REG_1, 0),
 			BPF_MOV64_IMM(BPF_REG_2, 0),
@@ -5663,7 +5663,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size = 0 allowed on != NULL stack pointer",
+		"helper access to variable memory: size = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
 			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
@@ -5680,7 +5680,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size = 0 allowed on != NULL map pointer",
+		"helper access to variable memory: size = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
@@ -5702,7 +5702,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size possible = 0 allowed on != NULL stack pointer",
+		"helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
@@ -5727,7 +5727,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size possible = 0 allowed on != NULL map pointer",
+		"helper access to variable memory: size possible = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
 			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
@@ -5750,7 +5750,7 @@ static struct bpf_test tests[] = {
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
 	{
-		"helper access to variable memory: size possible = 0 allowed on != NULL packet pointer",
+		"helper access to variable memory: size possible = 0 allowed on != NULL packet pointer (ARG_PTR_TO_MEM_OR_NULL)",
 		.insns = {
 			BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
 				    offsetof(struct __sk_buff, data)),
@@ -5771,6 +5771,105 @@ static struct bpf_test tests[] = {
 		.result = ACCEPT,
 		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
 	},
+	{
+		"helper access to variable memory: size = 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R1 type=inv expected=fp",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size > 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_IMM(BPF_REG_2, 1),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.errstr = "R1 type=inv expected=fp",
+		.result = REJECT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_2, 0),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
+	{
+		"helper access to variable memory: size possible = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+		.insns = {
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+			BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+			BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2),
+			BPF_MOV64_IMM(BPF_REG_3, 0),
+			BPF_EMIT_CALL(BPF_FUNC_probe_read),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map1 = { 3 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
 	{
 		"helper access to variable memory: 8 bytes leak",
 		.insns = {
-- 
cgit v1.3-14-g43fede


From eb33f2cca49ec49a1b893b5af546e7c042ca6365 Mon Sep 17 00:00:00 2001
From: Gianluca Borello <g.borello@gmail.com>
Date: Wed, 22 Nov 2017 18:32:54 +0000
Subject: bpf: remove explicit handling of 0 for arg2 in bpf_probe_read

Commit 9c019e2bc4b2 ("bpf: change helper bpf_probe_read arg2 type to
ARG_CONST_SIZE_OR_ZERO") changed arg2 type to ARG_CONST_SIZE_OR_ZERO to
simplify writing bpf programs by taking advantage of the new semantics
introduced for ARG_CONST_SIZE_OR_ZERO which allows <!NULL, 0> arguments.

In order to prevent the helper from actually passing a NULL pointer to
probe_kernel_read, which can happen when <NULL, 0> is passed to the helper,
the commit also introduced an explicit check against size == 0.

After the recent introduction of the ARG_PTR_TO_MEM_OR_NULL type,
bpf_probe_read can not receive a pair of <NULL, 0> arguments anymore, thus
the check is not needed anymore and can be removed, since probe_kernel_read
can correctly handle a <!NULL, 0> call. This also fixes the semantics of
the helper before it gets officially released and bpf programs start
relying on this check.

Fixes: 9c019e2bc4b2 ("bpf: change helper bpf_probe_read arg2 type to ARG_CONST_SIZE_OR_ZERO")
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index a5580c670866..728909f7951c 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -78,16 +78,12 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
 
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
-	int ret = 0;
-
-	if (unlikely(size == 0))
-		goto out;
+	int ret;
 
 	ret = probe_kernel_read(dst, unsafe_ptr, size);
 	if (unlikely(ret < 0))
 		memset(dst, 0, size);
 
- out:
 	return ret;
 }
 
-- 
cgit v1.3-14-g43fede


From 5c4e1201740ceae9bd6f622851a9bf7c66debe3a Mon Sep 17 00:00:00 2001
From: Gianluca Borello <g.borello@gmail.com>
Date: Wed, 22 Nov 2017 18:32:55 +0000
Subject: bpf: change bpf_probe_read_str arg2 type to ARG_CONST_SIZE_OR_ZERO

Commit 9fd29c08e520 ("bpf: improve verifier ARG_CONST_SIZE_OR_ZERO
semantics") relaxed the treatment of ARG_CONST_SIZE_OR_ZERO due to the way
the compiler generates optimized BPF code when checking boundaries of an
argument from C code. A typical example of this optimized code can be
generated using the bpf_probe_read_str helper when operating on variable
memory:

/* len is a generic scalar */
if (len > 0 && len <= 0x7fff)
        bpf_probe_read_str(p, len, s);

251: (79) r1 = *(u64 *)(r10 -88)
252: (07) r1 += -1
253: (25) if r1 > 0x7ffe goto pc-42
254: (bf) r1 = r7
255: (79) r2 = *(u64 *)(r10 -88)
256: (bf) r8 = r4
257: (85) call bpf_probe_read_str#45
R2 min value is negative, either use unsigned or 'var &= const'

With this code, the verifier loses track of the variable.

Replacing arg2 with ARG_CONST_SIZE_OR_ZERO is thus desirable since it
avoids this quite common case which leads to usability issues, and the
compiler generates code that the verifier can more easily test:

if (len <= 0x7fff)
        bpf_probe_read_str(p, len, s);

or

bpf_probe_read_str(p, len & 0x7fff, s);

No changes to the bpf_probe_read_str helper are necessary since
strncpy_from_unsafe itself immediately returns if the size passed is 0.

Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 728909f7951c..ed8601a1a861 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -494,7 +494,7 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
-	.arg2_type	= ARG_CONST_SIZE,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg3_type	= ARG_ANYTHING,
 };
 
-- 
cgit v1.3-14-g43fede


From a60dd35d2e39209fa7645945e1192bf9769872c6 Mon Sep 17 00:00:00 2001
From: Gianluca Borello <g.borello@gmail.com>
Date: Wed, 22 Nov 2017 18:32:56 +0000
Subject: bpf: change bpf_perf_event_output arg5 type to ARG_CONST_SIZE_OR_ZERO

Commit 9fd29c08e520 ("bpf: improve verifier ARG_CONST_SIZE_OR_ZERO
semantics") relaxed the treatment of ARG_CONST_SIZE_OR_ZERO due to the way
the compiler generates optimized BPF code when checking boundaries of an
argument from C code. A typical example of this optimized code can be
generated using the bpf_perf_event_output helper when operating on variable
memory:

/* len is a generic scalar */
if (len > 0 && len <= 0x7fff)
        bpf_perf_event_output(ctx, &perf_map, 0, buf, len);

110: (79) r5 = *(u64 *)(r10 -40)
111: (bf) r1 = r5
112: (07) r1 += -1
113: (25) if r1 > 0x7ffe goto pc+6
114: (bf) r1 = r6
115: (18) r2 = 0xffff94e5f166c200
117: (b7) r3 = 0
118: (bf) r4 = r7
119: (85) call bpf_perf_event_output#25
R5 min value is negative, either use unsigned or 'var &= const'

With this code, the verifier loses track of the variable.

Replacing arg5 with ARG_CONST_SIZE_OR_ZERO is thus desirable since it
avoids this quite common case which leads to usability issues, and the
compiler generates code that the verifier can more easily test:

if (len <= 0x7fff)
        bpf_perf_event_output(ctx, &perf_map, 0, buf, len);

or

bpf_perf_event_output(ctx, &perf_map, 0, buf, len & 0x7fff);

No changes to the bpf_perf_event_output helper are necessary since it can
handle a case where size is 0, and an empty frame is pushed.

Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Gianluca Borello <g.borello@gmail.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 kernel/trace/bpf_trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ed8601a1a861..27d1f4ffa3de 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -403,7 +403,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -605,7 +605,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
-- 
cgit v1.3-14-g43fede


From c131187db2d3fa2f8bf32fdf4e9a4ef805168467 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 22 Nov 2017 16:42:05 -0800
Subject: bpf: fix branch pruning logic

when the verifier detects that register contains a runtime constant
and it's compared with another constant it will prune exploration
of the branch that is guaranteed not to be taken at runtime.
This is all correct, but malicious program may be constructed
in such a way that it always has a constant comparison and
the other branch is never taken under any conditions.
In this case such path through the program will not be explored
by the verifier. It won't be taken at run-time either, but since
all instructions are JITed the malicious program may cause JITs
to complain about using reserved fields, etc.
To fix the issue we have to track the instructions explored by
the verifier and sanitize instructions that are dead at run time
with NOPs. We cannot reject such dead code, since llvm generates
it for valid C code, since it doesn't do as much data flow
analysis as the verifier does.

Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf_verifier.h |  2 +-
 kernel/bpf/verifier.c        | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index b61482d354a2..c561b986bab0 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -115,7 +115,7 @@ struct bpf_insn_aux_data {
 		struct bpf_map *map_ptr;	/* pointer for call insn into lookup_elem */
 	};
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
-	int converted_op_size; /* the valid value width after perceived conversion */
+	bool seen; /* this insn was processed by the verifier */
 };
 
 #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 308b0638ec5d..d4593571c404 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3827,6 +3827,7 @@ static int do_check(struct bpf_verifier_env *env)
 			return err;
 
 		regs = cur_regs(env);
+		env->insn_aux_data[insn_idx].seen = true;
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
 			if (err)
@@ -4022,6 +4023,7 @@ process_bpf_exit:
 					return err;
 
 				insn_idx++;
+				env->insn_aux_data[insn_idx].seen = true;
 			} else {
 				verbose(env, "invalid BPF_LD mode\n");
 				return -EINVAL;
@@ -4204,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
 				u32 off, u32 cnt)
 {
 	struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+	int i;
 
 	if (cnt == 1)
 		return 0;
@@ -4213,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
 	memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
 	memcpy(new_data + off + cnt - 1, old_data + off,
 	       sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+	for (i = off; i < off + cnt - 1; i++)
+		new_data[i].seen = true;
 	env->insn_aux_data = new_data;
 	vfree(old_data);
 	return 0;
@@ -4231,6 +4236,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 	return new_prog;
 }
 
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+	struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+	struct bpf_insn *insn = env->prog->insnsi;
+	const int insn_cnt = env->prog->len;
+	int i;
+
+	for (i = 0; i < insn_cnt; i++) {
+		if (aux_data[i].seen)
+			continue;
+		memcpy(insn + i, &nop, sizeof(nop));
+	}
+}
+
 /* convert load instructions that access fields of 'struct __sk_buff'
  * into sequence of instructions that access fields of 'struct sk_buff'
  */
@@ -4557,6 +4581,9 @@ skip_full_check:
 	while (!pop_stack(env, NULL, NULL));
 	free_states(env);
 
+	if (ret == 0)
+		sanitize_dead_code(env);
+
 	if (ret == 0)
 		/* program is valid, convert *(u32*)(ctx + off) accesses */
 		ret = convert_ctx_accesses(env);
-- 
cgit v1.3-14-g43fede


From 75f1133873d6a1276d3c19918b7c94975840f990 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 22 Nov 2017 12:56:45 -0800
Subject: genirq/matrix: Make - vs ?: Precedence explicit

Noticed with a Clang build. This improves the readability of the ?:
expression, as it has lower precedence than the - expression. Show
explicitly that - is evaluated first.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20171122205645.GA27125@beast
---
 kernel/irq/matrix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index a3cbbc8191c5..7df2480005f8 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -384,7 +384,7 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
 {
 	struct cpumap *cm = this_cpu_ptr(m->maps);
 
-	return m->global_available - cpudown ? cm->available : 0;
+	return (m->global_available - cpudown) ? cm->available : 0;
 }
 
 /**
-- 
cgit v1.3-14-g43fede