From aff7385b5a16bca6b8d9243f01a9ea5a5b411e1d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 21 Jan 2014 15:35:53 -0800 Subject: locking/mutexes/mcs: Correct barrier usage This patch corrects the way memory barriers are used in the MCS lock with the smp_load_acquire and smp_store_release functions. The previous barriers could leak critical sections if the MCS lock is used by itself. It is not a problem when the MCS lock is embedded in a mutex, but it will be an issue when the MCS lock is used elsewhere. The patch removes the incorrect barriers and puts in correct barriers with the pair of functions smp_load_acquire and smp_store_release. Suggested-by: Michel Lespinasse Reviewed-by: Paul E. McKenney Signed-off-by: Waiman Long Signed-off-by: Jason Low Signed-off-by: Tim Chen Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1390347353.3138.62.camel@schen9-DESK Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4dd6e4c219de..fbbd2eda867e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -136,9 +136,12 @@ void mspin_lock(struct mspin_node **lock, struct mspin_node *node) return; } ACCESS_ONCE(prev->next) = node; - smp_wmb(); - /* Wait until the lock holder passes the lock down */ - while (!ACCESS_ONCE(node->locked)) + /* + * Wait until the lock holder passes the lock down. + * Using smp_load_acquire() provides a memory barrier that + * ensures subsequent operations happen after the lock is acquired. + */ + while (!(smp_load_acquire(&node->locked))) arch_mutex_cpu_relax(); } @@ -156,8 +159,13 @@ static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) while (!(next = ACCESS_ONCE(node->next))) arch_mutex_cpu_relax(); } - ACCESS_ONCE(next->locked) = 1; - smp_wmb(); + /* + * Pass lock to next waiter. + * smp_store_release() provides a memory barrier to ensure + * all operations in the critical section has been completed + * before unlocking. + */ + smp_store_release(&next->locked, 1); } /* -- cgit v1.3-14-g43fede From e72246748ff006ab928bc774e276e6ef5542f9c5 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 21 Jan 2014 15:36:00 -0800 Subject: locking/mutexes/mcs: Restructure the MCS lock defines and locking code into its own file We will need the MCS lock code for doing optimistic spinning for rwsem and queued rwlock. Extracting the MCS code from mutex.c and putting it into its own file allows us to reuse this code easily. We also inline the mcs_spin_lock and mcs_spin_unlock functions for better efficiency. Note that the smp_load_acquire/smp_store_release pair used in mcs_lock and mcs_unlock is not sufficient to form a full memory barrier across cpus for many architectures (except x86). For applications that absolutely need a full barrier across multiple cpus with an mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be used after mcs_lock. Reviewed-by: Paul E.
McKenney Signed-off-by: Tim Chen Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1390347360.3138.63.camel@schen9-DESK Signed-off-by: Ingo Molnar --- include/linux/mcs_spinlock.h | 77 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/mutex.h | 5 +-- kernel/locking/mutex.c | 68 ++++---------------------------------- 3 files changed, 87 insertions(+), 63 deletions(-) create mode 100644 include/linux/mcs_spinlock.h (limited to 'kernel') diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h new file mode 100644 index 000000000000..9578ef81940b --- /dev/null +++ b/include/linux/mcs_spinlock.h @@ -0,0 +1,77 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. + * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H + +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. + * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* Lock acquired */ + node->locked = 1; + return; + } + ACCESS_ONCE(prev->next) = node; + /* + * Wait until the lock holder passes the lock down. + * Using smp_load_acquire() provides a memory barrier that + * ensures subsequent operations happen after the lock is acquired. + */ + while (!(smp_load_acquire(&node->locked))) + arch_mutex_cpu_relax(); +} + +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (cmpxchg(lock, node, NULL) == node) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + /* + * Pass lock to next waiter. + * smp_store_release() provides a memory barrier to ensure + * all operations in the critical section has been completed + * before unlocking. 
+ */ + smp_store_release(&next->locked, 1); +} + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/include/linux/mutex.h b/include/linux/mutex.h index d3181936c138..c482e1d2cc49 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -46,6 +46,7 @@ * - detects multi-task circular deadlocks and prints out all affected * locks and tasks (and only those tasks) */ +struct mcs_spinlock; struct mutex { /* 1: unlocked, 0: locked, negative: locked, possible waiters */ atomic_t count; @@ -55,7 +56,7 @@ struct mutex { struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - void *spin_mlock; /* Spinner MCS lock */ + struct mcs_spinlock *mcs_lock; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; @@ -179,4 +180,4 @@ extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); # define arch_mutex_cpu_relax() cpu_relax() #endif -#endif +#endif /* __LINUX_MUTEX_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index fbbd2eda867e..45fe1b5293d6 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,6 +25,7 @@ #include #include #include +#include /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, @@ -52,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->spin_mlock = NULL; + lock->mcs_lock = NULL; #endif debug_mutex_init(lock, name, key); @@ -111,62 +112,7 @@ EXPORT_SYMBOL(mutex_lock); * more or less simultaneously, the spinners need to acquire a MCS lock * first before spinning on the owner field. * - * We don't inline mspin_lock() so that perf can correctly account for the - * time spent in this lock function. */ -struct mspin_node { - struct mspin_node *next ; - int locked; /* 1 if lock acquired */ -}; -#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) - -static noinline -void mspin_lock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* Lock acquired */ - node->locked = 1; - return; - } - ACCESS_ONCE(prev->next) = node; - /* - * Wait until the lock holder passes the lock down. - * Using smp_load_acquire() provides a memory barrier that - * ensures subsequent operations happen after the lock is acquired. - */ - while (!(smp_load_acquire(&node->locked))) - arch_mutex_cpu_relax(); -} - -static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) -{ - struct mspin_node *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (cmpxchg(lock, node, NULL) == node) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - /* - * Pass lock to next waiter. - * smp_store_release() provides a memory barrier to ensure - * all operations in the critical section has been completed - * before unlocking. 
- */ - smp_store_release(&next->locked, 1); -} /* * Mutex spinning code migrated from kernel/sched/core.c @@ -456,7 +402,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, for (;;) { struct task_struct *owner; - struct mspin_node node; + struct mcs_spinlock node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -478,10 +424,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * If there's an owner, wait for it to either * release the lock or go to sleep. */ - mspin_lock(MLOCK(lock), &node); + mcs_spin_lock(&lock->mcs_lock, &node); owner = ACCESS_ONCE(lock->owner); if (owner && !mutex_spin_on_owner(lock, owner)) { - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); goto slowpath; } @@ -496,11 +442,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); preempt_enable(); return 0; } - mspin_unlock(MLOCK(lock), &node); + mcs_spin_unlock(&lock->mcs_lock, &node); /* * When there's no owner, we might have preempted between the -- cgit v1.3-14-g43fede From 52bf84aa206cd2c2516dfa3e03b578edf8a3242f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:40 -0500 Subject: sched/numa, mm: Remove p->numa_migrate_deferred Excessive migration of pages can hurt the performance of workloads that span multiple NUMA nodes. However, it turns out that the p->numa_migrate_deferred knob is a really big hammer, which does reduce migration rates, but does not actually help performance. Now that the second stage of the automatic numa balancing code has stabilized, it is time to replace the simplistic migration deferral code with something smarter. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-2-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- Documentation/sysctl/kernel.txt | 10 +-------- include/linux/sched.h | 1 - kernel/sched/fair.c | 8 -------- kernel/sysctl.c | 7 ------- mm/mempolicy.c | 45 ----------------------------------------- 5 files changed, 1 insertion(+), 70 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 6d486404200e..760f6e6a2e40 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -386,8 +386,7 @@ feature should be disabled. Otherwise, if the system overhead from the feature is too high then the rate the kernel samples for NUMA hinting faults may be controlled by the numa_balancing_scan_period_min_ms, numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms, -numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and -numa_balancing_migrate_deferred. +numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls. ============================================================== @@ -428,13 +427,6 @@ rate for each task. numa_balancing_scan_size_mb is how many megabytes worth of pages are scanned for a given scan. -numa_balancing_migrate_deferred is how many page migrations get skipped -unconditionally, after a page migration is skipped because a page is shared -with other tasks. This reduces page migration overhead, and determines -how much stronger the "move task near its memory" policy scheduler becomes, -versus the "move memory near its task" memory management policy, for workloads -with shared memory. 
- ============================================================== osrelease, ostype & version: diff --git a/include/linux/sched.h b/include/linux/sched.h index ffccdad050b5..d572d5ba650f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1457,7 +1457,6 @@ struct task_struct { unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; - int numa_migrate_deferred; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efe6457ac5c8..7cdde913b4dc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -819,14 +819,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; -/* - * After skipping a page migration on a shared page, skip N more numa page - * migrations unconditionally. This reduces the number of NUMA migrations - * in shared memory workloads, and has the effect of pulling tasks towards - * where their memory lives, over pulling the memory towards the task. - */ -unsigned int sysctl_numa_balancing_migrate_deferred = 16; - static unsigned int task_nr_scan_windows(struct task_struct *p) { unsigned long rss = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8da99f905cf..b41d61d95c14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -384,13 +384,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "numa_balancing_migrate_deferred", - .data = &sysctl_numa_balancing_migrate_deferred, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0cd2c4d4e270..68d5c7f7164e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2304,35 +2304,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -#ifdef CONFIG_NUMA_BALANCING -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - /* Never defer a private fault */ - if (cpupid_match_pid(p, last_cpupid)) - return false; - - if (p->numa_migrate_deferred) { - p->numa_migrate_deferred--; - return true; - } - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; -} -#else -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2435,24 +2406,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ last_cpupid = page_cpupid_xchg_last(page, this_cpupid); if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { - - /* See sysctl_numa_balancing_migrate_deferred comment */ - if (!cpupid_match_pid(current, last_cpupid)) - defer_numa_migrate(current); - goto out; } - - /* - * The quadratic filter above reduces extraneous migration - * of shared pages somewhat. This code reduces it even more, - * reducing the overhead of page migrations of shared pages. - * This makes workloads with shared pages rely more on - * "move task near its memory", and less on "move memory - * towards its task", which is exactly what we want. 
- */ - if (numa_migrate_deferred(current, last_cpupid)) - goto out; } if (curnid != polnid) -- cgit v1.3-14-g43fede From ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:41 -0500 Subject: sched/numa: Rename p->numa_faults to numa_faults_memory In order to get a more consistent naming scheme, making it clear which fault statistics track memory locality, and which track CPU locality, rename the memory fault statistics. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-3-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++---- kernel/sched/core.c | 4 ++-- kernel/sched/debug.c | 6 +++--- kernel/sched/fair.c | 56 +++++++++++++++++++++++++-------------------------- 4 files changed, 37 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d572d5ba650f..144d509df053 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1469,15 +1469,15 @@ struct task_struct { * Scheduling placement decisions are made based on the these counts. * The values remain static for the duration of a PTE scan */ - unsigned long *numa_faults; + unsigned long *numa_faults_memory; unsigned long total_numa_faults; /* * numa_faults_buffer records faults per node during the current - * scan window. When the scan completes, the counts in numa_faults - * decay and these values are copied. + * scan window. When the scan completes, the counts in + * numa_faults_memory decay and these values are copied. */ - unsigned long *numa_faults_buffer; + unsigned long *numa_faults_buffer_memory; /* * numa_faults_locality tracks if faults recorded during the last diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 81343d6bd9cb..bc708c53bf03 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1744,8 +1744,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10e..31b908daaa1b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -533,15 +533,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) unsigned long nr_faults = -1; int cpu_current, home_node; - if (p->numa_faults) - nr_faults = p->numa_faults[2*node + i]; + if (p->numa_faults_memory) + nr_faults = p->numa_faults_memory[2*node + i]; cpu_current = !i ? 
(task_node(p) == node) : (pol && node_isset(node, pol->v.nodes)); home_node = (p->numa_preferred_nid == node); - SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n", + SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n", i, node, cpu_current, home_node, nr_faults); } } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7cdde913b4dc..3e616d704f67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -901,11 +901,11 @@ static inline int task_faults_idx(int nid, int priv) static inline unsigned long task_faults(struct task_struct *p, int nid) { - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; - return p->numa_faults[task_faults_idx(nid, 0)] + - p->numa_faults[task_faults_idx(nid, 1)]; + return p->numa_faults_memory[task_faults_idx(nid, 0)] + + p->numa_faults_memory[task_faults_idx(nid, 1)]; } static inline unsigned long group_faults(struct task_struct *p, int nid) @@ -927,7 +927,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid) { unsigned long total_faults; - if (!p->numa_faults) + if (!p->numa_faults_memory) return 0; total_faults = p->total_numa_faults; @@ -1255,7 +1255,7 @@ static int task_numa_migrate(struct task_struct *p) static void numa_migrate_preferred(struct task_struct *p) { /* This task has no NUMA fault statistics yet */ - if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) + if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory)) return; /* Periodically retry migrating the task to the preferred node */ @@ -1371,16 +1371,16 @@ static void task_numa_placement(struct task_struct *p) long diff; i = task_faults_idx(nid, priv); - diff = -p->numa_faults[i]; + diff = -p->numa_faults_memory[i]; /* Decay existing window, copy faults since last scan */ - p->numa_faults[i] >>= 1; - p->numa_faults[i] += p->numa_faults_buffer[i]; - fault_types[priv] += p->numa_faults_buffer[i]; - p->numa_faults_buffer[i] = 0; + p->numa_faults_memory[i] >>= 1; + p->numa_faults_memory[i] += p->numa_faults_buffer_memory[i]; + fault_types[priv] += p->numa_faults_buffer_memory[i]; + p->numa_faults_buffer_memory[i] = 0; - faults += p->numa_faults[i]; - diff += p->numa_faults[i]; + faults += p->numa_faults_memory[i]; + diff += p->numa_faults_memory[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ @@ -1465,7 +1465,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, grp->gid = p->pid; for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] = p->numa_faults[i]; + grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1523,8 +1523,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); for (i = 0; i < 2*nr_node_ids; i++) { - my_grp->faults[i] -= p->numa_faults[i]; - grp->faults[i] += p->numa_faults[i]; + my_grp->faults[i] -= p->numa_faults_memory[i]; + grp->faults[i] += p->numa_faults_memory[i]; } my_grp->total_faults -= p->total_numa_faults; grp->total_faults += p->total_numa_faults; @@ -1550,12 +1550,12 @@ void task_numa_free(struct task_struct *p) { struct numa_group *grp = p->numa_group; int i; - void *numa_faults = p->numa_faults; + void *numa_faults = p->numa_faults_memory; if (grp) { spin_lock(&grp->lock); for (i = 0; i < 2*nr_node_ids; i++) - grp->faults[i] -= p->numa_faults[i]; + grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; list_del(&p->numa_entry); @@ -1565,8 +1565,8 @@ void task_numa_free(struct 
task_struct *p) put_numa_group(grp); } - p->numa_faults = NULL; - p->numa_faults_buffer = NULL; + p->numa_faults_memory = NULL; + p->numa_faults_buffer_memory = NULL; kfree(numa_faults); } @@ -1591,16 +1591,16 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; /* Allocate buffer to track faults on a per-node basis */ - if (unlikely(!p->numa_faults)) { - int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; + if (unlikely(!p->numa_faults_memory)) { + int size = sizeof(*p->numa_faults_memory) * 2 * nr_node_ids; /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); - if (!p->numa_faults) + p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); + if (!p->numa_faults_memory) return; - BUG_ON(p->numa_faults_buffer); - p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); + BUG_ON(p->numa_faults_buffer_memory); + p->numa_faults_buffer_memory = p->numa_faults_memory + (2 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1629,7 +1629,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } @@ -4771,7 +4771,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) { int src_nid, dst_nid; - if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || + if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) { return false; } @@ -4802,7 +4802,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env) if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) return false; - if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) + if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA)) return false; src_nid = cpu_to_node(env->src_cpu); -- cgit v1.3-14-g43fede From 50ec8a401fed6d246ab65e6011d61ac91c34af70 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:42 -0500 Subject: sched/numa: Track from which nodes NUMA faults are triggered Track which nodes NUMA faults are triggered from, in other words the CPUs on which the NUMA faults happened. This uses a similar mechanism to what is used to track the memory involved in numa faults. The next patches use this to build up a bitmap of which nodes a workload is actively running on. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-4-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 +++++++-- kernel/sched/fair.c | 30 +++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 144d509df053..5fb0cfb43ecf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1479,6 +1479,13 @@ struct task_struct { */ unsigned long *numa_faults_buffer_memory; + /* + * Track the nodes the process was running on when a NUMA hinting + * fault was incurred. + */ + unsigned long *numa_faults_cpu; + unsigned long *numa_faults_buffer_cpu; + /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local. 
The task scan period is adapted @@ -1582,8 +1589,6 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p); - -extern unsigned int sysctl_numa_balancing_migrate_deferred; #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e616d704f67..4841aaff7394 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -886,6 +886,7 @@ struct numa_group { struct rcu_head rcu; unsigned long total_faults; + unsigned long *faults_cpu; unsigned long faults[0]; }; @@ -1368,10 +1369,11 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { - long diff; + long diff, f_diff; i = task_faults_idx(nid, priv); diff = -p->numa_faults_memory[i]; + f_diff = -p->numa_faults_cpu[i]; /* Decay existing window, copy faults since last scan */ p->numa_faults_memory[i] >>= 1; @@ -1379,12 +1381,18 @@ static void task_numa_placement(struct task_struct *p) fault_types[priv] += p->numa_faults_buffer_memory[i]; p->numa_faults_buffer_memory[i] = 0; + p->numa_faults_cpu[i] >>= 1; + p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i]; + p->numa_faults_buffer_cpu[i] = 0; + faults += p->numa_faults_memory[i]; diff += p->numa_faults_memory[i]; + f_diff += p->numa_faults_cpu[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ p->numa_group->faults[i] += diff; + p->numa_group->faults_cpu[i] += f_diff; p->numa_group->total_faults += diff; group_faults += p->numa_group->faults[i]; } @@ -1453,7 +1461,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (unlikely(!p->numa_group)) { unsigned int size = sizeof(struct numa_group) + - 2*nr_node_ids*sizeof(unsigned long); + 4*nr_node_ids*sizeof(unsigned long); grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!grp) @@ -1463,8 +1471,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, spin_lock_init(&grp->lock); INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; + /* Second half of the array tracks nids where faults happen */ + grp->faults_cpu = grp->faults + 2 * nr_node_ids; - for (i = 0; i < 2*nr_node_ids; i++) + for (i = 0; i < 4*nr_node_ids; i++) grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1522,7 +1532,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) { + for (i = 0; i < 4*nr_node_ids; i++) { my_grp->faults[i] -= p->numa_faults_memory[i]; grp->faults[i] += p->numa_faults_memory[i]; } @@ -1554,7 +1564,7 @@ void task_numa_free(struct task_struct *p) if (grp) { spin_lock(&grp->lock); - for (i = 0; i < 2*nr_node_ids; i++) + for (i = 0; i < 4*nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; @@ -1567,6 +1577,8 @@ void task_numa_free(struct task_struct *p) p->numa_faults_memory = NULL; p->numa_faults_buffer_memory = NULL; + p->numa_faults_cpu= NULL; + p->numa_faults_buffer_cpu = NULL; kfree(numa_faults); } @@ -1577,6 +1589,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; + int this_node = task_node(current); int priv; if (!numabalancing_enabled) @@ -1592,7 
+1605,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * 2 * nr_node_ids; + int size = sizeof(*p->numa_faults_memory) * 4 * nr_node_ids; /* numa_faults and numa_faults_buffer share the allocation */ p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); @@ -1600,7 +1613,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) return; BUG_ON(p->numa_faults_buffer_memory); - p->numa_faults_buffer_memory = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); + p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); + p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); p->total_numa_faults = 0; memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } @@ -1630,6 +1645,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) p->numa_pages_migrated += pages; p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } -- cgit v1.3-14-g43fede From 20e07dea286a90f096a779706861472d296397c6 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:43 -0500 Subject: sched/numa: Build per numa_group active node mask from numa_faults_cpu statistics The numa_faults_cpu statistics are used to maintain an active_nodes nodemask per numa_group. This allows us to be smarter about when to do numa migrations. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-5-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4841aaff7394..1ee921f1ec35 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -885,6 +885,7 @@ struct numa_group { struct list_head task_list; struct rcu_head rcu; + nodemask_t active_nodes; unsigned long total_faults; unsigned long *faults_cpu; unsigned long faults[0]; @@ -918,6 +919,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid) p->numa_group->faults[task_faults_idx(nid, 1)]; } +static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) +{ + return group->faults_cpu[task_faults_idx(nid, 0)] + + group->faults_cpu[task_faults_idx(nid, 1)]; +} + /* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. The group weight is given a @@ -1270,6 +1277,38 @@ static void numa_migrate_preferred(struct task_struct *p) task_numa_migrate(p); } +/* + * Find the nodes on which the workload is actively running. We do this by + * tracking the nodes from which NUMA hinting faults are triggered. This can + * be different from the set of nodes where the workload's memory is currently + * located. + * + * The bitmask is used to make smarter decisions on when to do NUMA page + * migrations, To prevent flip-flopping, and excessive page migrations, nodes + * are added when they cause over 6/16 of the maximum number of faults, but + * only removed when they drop below 3/16. 
+ */ +static void update_numa_active_node_mask(struct numa_group *numa_group) +{ + unsigned long faults, max_faults = 0; + int nid; + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (faults > max_faults) + max_faults = faults; + } + + for_each_online_node(nid) { + faults = group_faults_cpu(numa_group, nid); + if (!node_isset(nid, numa_group->active_nodes)) { + if (faults > max_faults * 6 / 16) + node_set(nid, numa_group->active_nodes); + } else if (faults < max_faults * 3 / 16) + node_clear(nid, numa_group->active_nodes); + } +} + /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan @@ -1412,6 +1451,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { + update_numa_active_node_mask(p->numa_group); /* * If the preferred task and group nids are different, * iterate over the nodes again to find the best place. @@ -1474,6 +1514,8 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + 2 * nr_node_ids; + node_set(task_node(current), grp->active_nodes); + for (i = 0; i < 4*nr_node_ids; i++) grp->faults[i] = p->numa_faults_memory[i]; -- cgit v1.3-14-g43fede From 10f39042711ba21773763f267b4943a2c66c8bef Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:44 -0500 Subject: sched/numa, mm: Use active_nodes nodemask to limit numa migrations Use the active_nodes nodemask to make smarter decisions on NUMA migrations. In order to maximize performance of workloads that do not fit in one NUMA node, we want to satisfy the following criteria: 1) keep private memory local to each thread 2) avoid excessive NUMA migration of pages 3) distribute shared memory across the active nodes, to maximize memory bandwidth available to the workload This patch accomplishes that by implementing the following policy for NUMA migrations: 1) always migrate on a private fault 2) never migrate to a node that is not in the set of active nodes for the numa_group 3) always migrate from a node outside of the set of active nodes, to a node that is in that set 4) within the set of active nodes in the numa_group, only migrate from a node with more NUMA page faults, to a node with fewer NUMA page faults, with a 25% margin to avoid ping-ponging This results in most pages of a workload ending up on the actively used nodes, with reduced ping-ponging of pages between those nodes. 
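As an illustration of rule 4's 25% margin, consider this minimal stand-alone C model (the fault counts are made up; the kernel's version operates on per-numa_group fault statistics):

#include <stdbool.h>
#include <stdio.h>

/* Rule 4 as a predicate: within the active node set, only migrate
 * toward a node that has clearly fewer group faults than the source. */
static bool migrate_within_active(unsigned long src_faults,
				  unsigned long dst_faults)
{
	return dst_faults < src_faults * 3 / 4;
}

int main(void)
{
	/* dst has 290 faults, src has 400: 290 < 300, so migrating
	 * pages from src toward dst is allowed. */
	printf("400 -> 290: %d\n", migrate_within_active(400, 290));

	/* The reverse direction would require 400 < 217, which is
	 * false, so the same pages cannot immediately bounce back. */
	printf("290 -> 400: %d\n", migrate_within_active(290, 400));
	return 0;
}

Because the two comparisons cannot both hold at once, page movement between two comparably loaded active nodes settles down instead of oscillating.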
Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-6-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ++++++ kernel/sched/fair.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/mempolicy.c | 29 +----------------------- 3 files changed, 71 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5fb0cfb43ecf..5ab3b89fc33e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1589,6 +1589,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p); +extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, + int src_nid, int dst_cpu); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -1604,6 +1606,11 @@ static inline void set_numabalancing_state(bool enabled) static inline void task_numa_free(struct task_struct *p) { } +static inline bool should_numa_migrate_memory(struct task_struct *p, + struct page *page, int src_nid, int dst_cpu) +{ + return true; +} #endif static inline struct pid *task_pid(struct task_struct *task) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1ee921f1ec35..eeabb33f349e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -954,6 +954,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid) return 1000 * group_faults(p, nid) / p->numa_group->total_faults; } +bool should_numa_migrate_memory(struct task_struct *p, struct page * page, + int src_nid, int dst_cpu) +{ + struct numa_group *ng = p->numa_group; + int dst_nid = cpu_to_node(dst_cpu); + int last_cpupid, this_cpupid; + + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); + + /* + * Multi-stage node selection is used in conjunction with a periodic + * migration fault to build a temporal task<->page relation. By using + * a two-stage filter we remove short/unlikely relations. + * + * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate + * a task's usage of a particular page (n_p) per total usage of this + * page (n_t) (in a given time-span) to a probability. + * + * Our periodic faults will sample this probability and getting the + * same result twice in a row, given these samples are fully + * independent, is then given by P(n)^2, provided our sample period + * is sufficiently short compared to the usage pattern. + * + * This quadric squishes small probabilities, making it less likely we + * act on an unlikely task<->page relation. + */ + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && + cpupid_to_nid(last_cpupid) != dst_nid) + return false; + + /* Always allow migrate on private faults */ + if (cpupid_match_pid(p, last_cpupid)) + return true; + + /* A shared fault, but p->numa_group has not been set up yet. */ + if (!ng) + return true; + + /* + * Do not migrate if the destination is not a node that + * is actively used by this numa group. + */ + if (!node_isset(dst_nid, ng->active_nodes)) + return false; + + /* + * Source is a node that is not actively used by this + * numa group, while the destination is. Migrate. 
+ */ + if (!node_isset(src_nid, ng->active_nodes)) + return true; + + /* + * Both source and destination are nodes in active + * use by this numa group. Maximize memory bandwidth + * by migrating from more heavily used groups, to less + * heavily used ones, spreading the load around. + * Use a 1/4 hysteresis to avoid spurious page movement. + */ + return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); +} + static unsigned long weighted_cpuload(const int cpu); static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 68d5c7f7164e..784c11ef7719 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2377,37 +2377,10 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_cpupid; - int this_cpupid; - polnid = thisnid; - this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. - * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; - } } if (curnid != polnid) -- cgit v1.3-14-g43fede From 7e2703e6099609adc93679c4d45cd6247f565971 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:45 -0500 Subject: sched/numa: Normalize faults_cpu stats and weigh by CPU use Tracing the code that decides the active nodes has made it abundantly clear that the naive implementation of the faults_from code has issues. Specifically, the garbage collector in some workloads will access orders of magnitude more memory than the threads that do all the active work. This resulted in the node with the garbage collector being marked the only active node in the group. This issue is avoided if we weigh the statistics by the CPU use of each task in the numa group, instead of by how many faults each thread has incurred. To achieve this, we normalize the number of faults to the fraction of faults that occurred on each node, and then multiply that fraction by the fraction of CPU time the task has used since the last time task_numa_placement was invoked. This way the nodes in the active node mask will be the ones where the tasks from the numa group are most actively running, and the influence of e.g. the garbage collector and other do-little threads is properly minimized.
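For illustration, the normalization works out as follows in a small stand-alone model (the numbers are hypothetical; the kernel computes this with div64_u64() on u64 counters, as shown in the fair.c hunk below):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t runtime = 250;              /* ms of CPU used since last placement */
	uint64_t period = 1000;              /* ms elapsed since last placement */
	unsigned long buffer_cpu_faults = 8; /* this task's faults on this node */
	unsigned long total_faults = 32;     /* this task's faults on all nodes */

	/* Fraction of CPU time used, in 16-bit fixed point, mirroring
	 * f_weight = div64_u64(runtime << 16, period + 1); */
	uint64_t f_weight = (runtime << 16) / (period + 1);

	/* Scale by the fraction of the task's faults that hit this node. */
	f_weight = (f_weight * buffer_cpu_faults) / (total_faults + 1);

	printf("f_weight = %llu (~%.3f)\n",
	       (unsigned long long)f_weight, f_weight / 65536.0);
	return 0;
}

A task that used a quarter of the interval's CPU time and took a quarter of its faults on this node contributes about 0.06 in these fixed-point units, while a do-little thread with tiny runtime contributes almost nothing no matter how many faults it generates.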
On a 4 node system, using CPU use statistics calculated over a longer interval results in about 1% fewer page migrations with two 32-warehouse specjbb runs, and about 5% fewer page migrations, as well as 1% better throughput, with two 8-warehouse specjbb runs, as compared with the shorter-term statistics kept by the scheduler. Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-7-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ab3b89fc33e..ef92953764f2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1459,6 +1459,8 @@ struct task_struct { int numa_preferred_nid; unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ + u64 last_task_numa_placement; + u64 last_sum_exec_runtime; struct callback_head numa_work; struct list_head numa_entry; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bc708c53bf03..a561c9e8e382 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1746,6 +1746,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->numa_work.next = &p->numa_work; p->numa_faults_memory = NULL; p->numa_faults_buffer_memory = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; INIT_LIST_HEAD(&p->numa_entry); p->numa_group = NULL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index eeabb33f349e..8fc3a8234817 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -887,6 +887,11 @@ struct numa_group { struct rcu_head rcu; nodemask_t active_nodes; unsigned long total_faults; + /* + * Faults_cpu is used to decide whether memory should move + * towards the CPU. As a consequence, these stats are weighted + * more by CPU use than by memory faults. + */ unsigned long *faults_cpu; unsigned long faults[0]; }; @@ -1446,11 +1451,41 @@ static void update_task_scan_period(struct task_struct *p, memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); } +/* + * Get the fraction of time the task has been running since the last + * NUMA placement cycle. The scheduler keeps similar statistics, but + * decays those on a 32ms period, which is orders of magnitude off + * from the dozens-of-seconds NUMA balancing period. Use the scheduler + * stats only if the task is so new there are no NUMA statistics yet. + */ +static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) +{ + u64 runtime, delta, now; + /* Use the start of this time slice to avoid calculations.
*/ + now = p->se.exec_start; + runtime = p->se.sum_exec_runtime; + + if (p->last_task_numa_placement) { + delta = runtime - p->last_sum_exec_runtime; + *period = now - p->last_task_numa_placement; + } else { + delta = p->se.avg.runnable_avg_sum; + *period = p->se.avg.runnable_avg_period; + } + + p->last_sum_exec_runtime = runtime; + p->last_task_numa_placement = now; + + return delta; +} + static void task_numa_placement(struct task_struct *p) { int seq, nid, max_nid = -1, max_group_nid = -1; unsigned long max_faults = 0, max_group_faults = 0; unsigned long fault_types[2] = { 0, 0 }; + unsigned long total_faults; + u64 runtime, period; spinlock_t *group_lock = NULL; seq = ACCESS_ONCE(p->mm->numa_scan_seq); @@ -1459,6 +1494,10 @@ static void task_numa_placement(struct task_struct *p) p->numa_scan_seq = seq; p->numa_scan_period_max = task_scan_max(p); + total_faults = p->numa_faults_locality[0] + + p->numa_faults_locality[1]; + runtime = numa_get_avg_runtime(p, &period); + /* If the task is part of a group prevent parallel updates to group stats */ if (p->numa_group) { group_lock = &p->numa_group->lock; @@ -1471,7 +1510,7 @@ static void task_numa_placement(struct task_struct *p) int priv, i; for (priv = 0; priv < 2; priv++) { - long diff, f_diff; + long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); diff = -p->numa_faults_memory[i]; @@ -1483,8 +1522,18 @@ static void task_numa_placement(struct task_struct *p) fault_types[priv] += p->numa_faults_buffer_memory[i]; p->numa_faults_buffer_memory[i] = 0; + /* + * Normalize the faults_from, so all tasks in a group + * count according to CPU use, instead of by the raw + * number of faults. Tasks with little runtime have + * little over-all impact on throughput, and thus their + * faults are less important. + */ + f_weight = div64_u64(runtime << 16, period + 1); + f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / + (total_faults + 1); p->numa_faults_cpu[i] >>= 1; - p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i]; + p->numa_faults_cpu[i] += f_weight; p->numa_faults_buffer_cpu[i] = 0; faults += p->numa_faults_memory[i]; -- cgit v1.3-14-g43fede From 35664fd41e1c8cc4f0b89f6a51db5af39ba50640 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:46 -0500 Subject: sched/numa: Do statistics calculation using local variables only The current code in task_numa_placement calculates the difference between the old and the new value, but also temporarily stores half of the old value in the per-process variables. The NUMA balancing code looks at those per-process variables, and having other tasks temporarily see halved statistics could lead to unwanted numa migrations. This can be avoided by doing all the math in local variables. This change also simplifies the code a little. 
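A stand-alone model of the change (one plain counter instead of the per-node arrays) shows the difference:

#include <stdio.h>

int main(void)
{
	long faults = 100; /* decayed long-term counter, visible to others */
	long buffer = 40;  /* faults recorded during the current scan window */

	/*
	 * Old scheme: two separate visible updates,
	 *     faults >>= 1;      (a concurrent reader may see 50 here)
	 *     faults += buffer;
	 * New scheme: do the math in a local variable and publish once.
	 */
	long diff = buffer - faults / 2;
	faults += diff;

	printf("new value %ld, single visible update of %+ld\n", faults, diff);
	return 0;
}

A concurrent reader now sees either 100 or 90, never the transient halved value that could trigger a spurious migration decision.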
Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-8-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8fc3a8234817..4c449907a10e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1513,12 +1513,9 @@ static void task_numa_placement(struct task_struct *p) long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); - diff = -p->numa_faults_memory[i]; - f_diff = -p->numa_faults_cpu[i]; /* Decay existing window, copy faults since last scan */ - p->numa_faults_memory[i] >>= 1; - p->numa_faults_memory[i] += p->numa_faults_buffer_memory[i]; + diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2; fault_types[priv] += p->numa_faults_buffer_memory[i]; p->numa_faults_buffer_memory[i] = 0; @@ -1532,13 +1529,12 @@ static void task_numa_placement(struct task_struct *p) f_weight = div64_u64(runtime << 16, period + 1); f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) / (total_faults + 1); - p->numa_faults_cpu[i] >>= 1; - p->numa_faults_cpu[i] += f_weight; + f_diff = f_weight - p->numa_faults_cpu[i] / 2; p->numa_faults_buffer_cpu[i] = 0; + p->numa_faults_memory[i] += diff; + p->numa_faults_cpu[i] += f_diff; faults += p->numa_faults_memory[i]; - diff += p->numa_faults_memory[i]; - f_diff += p->numa_faults_cpu[i]; p->total_numa_faults += diff; if (p->numa_group) { /* safe because we can only change our own group */ -- cgit v1.3-14-g43fede From 58b46da336a9312b2e21bb576d1c2c484dbf6257 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:47 -0500 Subject: sched/numa: Rename variables in task_numa_fault() We track both the node of the memory after a NUMA fault, and the node of the CPU on which the fault happened. Rename the local variables in task_numa_fault to make things more explicit. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-9-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c449907a10e..d5832c367d87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1735,11 +1735,11 @@ void task_numa_free(struct task_struct *p) /* * Got a PROT_NONE fault for a page on @node. 
*/ -void task_numa_fault(int last_cpupid, int node, int pages, int flags) +void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) { struct task_struct *p = current; bool migrated = flags & TNF_MIGRATED; - int this_node = task_node(current); + int cpu_node = task_node(current); int priv; if (!numabalancing_enabled) @@ -1794,8 +1794,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; - p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; - p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages; + p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages; + p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages; p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; } -- cgit v1.3-14-g43fede From be1e4e760d940c14d119bffef5eb007dfdf29046 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 27 Jan 2014 17:03:48 -0500 Subject: sched/numa: Turn some magic numbers into #defines Cleanup suggested by Mel Gorman. Now the code contains some more hints on what statistics go where. Suggested-by: Mel Gorman Signed-off-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Peter Zijlstra Cc: Chegu Vinod Link: http://lkml.kernel.org/r/1390860228-21539-10-git-send-email-riel@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d5832c367d87..1f41b122198e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -896,6 +896,15 @@ struct numa_group { unsigned long faults[0]; }; +/* Shared or private faults. */ +#define NR_NUMA_HINT_FAULT_TYPES 2 + +/* Memory and CPU locality */ +#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2) + +/* Averaged statistics, and temporary buffers. */ +#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2) + pid_t task_numa_group_id(struct task_struct *p) { return p->numa_group ? 
p->numa_group->gid : 0; @@ -903,7 +912,7 @@ pid_t task_numa_group_id(struct task_struct *p) static inline int task_faults_idx(int nid, int priv) { - return 2 * nid + priv; + return NR_NUMA_HINT_FAULT_TYPES * nid + priv; } static inline unsigned long task_faults(struct task_struct *p, int nid) @@ -1509,7 +1518,7 @@ static void task_numa_placement(struct task_struct *p) unsigned long faults = 0, group_faults = 0; int priv, i; - for (priv = 0; priv < 2; priv++) { + for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) { long diff, f_diff, f_weight; i = task_faults_idx(nid, priv); @@ -1620,11 +1629,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, INIT_LIST_HEAD(&grp->task_list); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ - grp->faults_cpu = grp->faults + 2 * nr_node_ids; + grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * + nr_node_ids; node_set(task_node(current), grp->active_nodes); - for (i = 0; i < 4*nr_node_ids; i++) + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults_memory[i]; grp->total_faults = p->total_numa_faults; @@ -1682,7 +1692,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, double_lock(&my_grp->lock, &grp->lock); - for (i = 0; i < 4*nr_node_ids; i++) { + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { my_grp->faults[i] -= p->numa_faults_memory[i]; grp->faults[i] += p->numa_faults_memory[i]; } @@ -1714,7 +1724,7 @@ void task_numa_free(struct task_struct *p) if (grp) { spin_lock(&grp->lock); - for (i = 0; i < 4*nr_node_ids; i++) + for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] -= p->numa_faults_memory[i]; grp->total_faults -= p->total_numa_faults; @@ -1755,14 +1765,20 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults_memory)) { - int size = sizeof(*p->numa_faults_memory) * 4 * nr_node_ids; + int size = sizeof(*p->numa_faults_memory) * + NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids; - /* numa_faults and numa_faults_buffer share the allocation */ - p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); + p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN); if (!p->numa_faults_memory) return; BUG_ON(p->numa_faults_buffer_memory); + /* + * The averaged statistics, shared & private, memory & cpu, + * occupy the first half of the array. The second half of the + * array is for current counters, which are averaged into the + * first set by task_numa_placement. + */ p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); -- cgit v1.3-14-g43fede From 81993e81a994504f4c8b97d3410c9a052cdbcc9d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 1 Feb 2014 18:54:11 -0800 Subject: compat: Get rid of (get|put)_compat_time(val|spec) We have two APIs for compatibility timespec/val, with confusingly similar names. compat_(get|put)_time(val|spec) *do* handle the case where COMPAT_USE_64BIT_TIME is set, whereas (get|put)_compat_time(val|spec) do not. This is an accident waiting to happen. Clean it up by favoring the full-service version; the limited version is replaced with double-underscore versions static to kernel/compat.c.
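The resulting body of compat_get_timespec() is not visible in the truncated hunks below, but based on the description above it amounts to something like this sketch (the name of the static double-underscore helper is an assumption here, not taken from the patch):

/*
 * Sketch, not the literal patch: the full-service helper honors
 * COMPAT_USE_64BIT_TIME by copying the native 64-bit layout directly,
 * and otherwise falls back to the 32-bit conversion. The old
 * get_compat_timespec() only ever performed the 32-bit conversion.
 */
int compat_get_timespec(struct timespec *ts, const void __user *uts)
{
	if (COMPAT_USE_64BIT_TIME)
		return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
	else
		return __compat_get_timespec(ts, uts); /* assumed helper name */
}

With that split, callers such as the v4l2, fs/compat.c, and ipc hunks below can use the full-service helper unconditionally.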
A common pattern is to convert a struct timespec to kernel format in an allocation on the user stack. Unfortunately it is open-coded in several places. Since this allocation isn't actually needed if COMPAT_USE_64BIT_TIME is true (since user format == kernel format) encapsulate that whole pattern into the function compat_convert_timespec(). An equivalent function should be written for struct timeval if it is needed in the future. Finally, get rid of compat_(get|put)_timeval_convert(): each was only used once, and the latter was not even doing what the function said (no conversion actually was being done.) Moving the conversion into compat_sys_settimeofday() itself makes the code much more similar to sys_settimeofday() itself. v3: Remove unused compat_convert_timeval(). v2: Drop bogus "const" in the destination argument for compat_convert_time*(). Cc: Mauro Carvalho Chehab Cc: Alexander Viro Cc: Hans Verkuil Cc: Andrew Morton Cc: Heiko Carstens Cc: Manfred Spraul Cc: Mateusz Guzik Cc: Rafael Aquini Cc: Davidlohr Bueso Cc: Stephen Rothwell Cc: Dan Carpenter Cc: Arnd Bergmann Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Linus Torvalds Cc: Catalin Marinas Cc: Will Deacon Tested-by: H.J. Lu Signed-off-by: H. Peter Anvin --- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 2 +- fs/compat.c | 6 +- include/linux/compat.h | 23 +++--- ipc/compat.c | 12 +-- ipc/compat_mq.c | 19 +---- kernel/compat.c | 108 +++++++++++++------------- kernel/futex_compat.c | 2 +- 7 files changed, 75 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index 8f7a6a454a4c..6191968db8fa 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -733,7 +733,7 @@ static int put_v4l2_event32(struct v4l2_event *kp, struct v4l2_event32 __user *u copy_to_user(&up->u, &kp->u, sizeof(kp->u)) || put_user(kp->pending, &up->pending) || put_user(kp->sequence, &up->sequence) || - put_compat_timespec(&kp->timestamp, &up->timestamp) || + compat_put_timespec(&kp->timestamp, &up->timestamp) || put_user(kp->id, &up->id) || copy_to_user(up->reserved, kp->reserved, 8 * sizeof(__u32))) return -EFAULT; diff --git a/fs/compat.c b/fs/compat.c index 6af20de2c1a3..62092f433759 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -92,8 +92,8 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena struct timespec tv[2]; if (t) { - if (get_compat_timespec(&tv[0], &t[0]) || - get_compat_timespec(&tv[1], &t[1])) + if (compat_get_timespec(&tv[0], &t[0]) || + compat_get_timespec(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) @@ -512,7 +512,7 @@ compat_sys_io_getevents(aio_context_t ctx_id, nr * sizeof(struct io_event)))) goto out; if (timeout) { - if (get_compat_timespec(&t, timeout)) + if (compat_get_timespec(&t, timeout)) goto out; ut = compat_alloc_user_space(sizeof(*ut)); diff --git a/include/linux/compat.h b/include/linux/compat.h index 3f448c65511b..f7d5bc0a239f 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -140,27 +140,24 @@ struct compat_sigaction { compat_sigset_t sa_mask __packed; }; -/* - * These functions operate strictly on struct compat_time* - */ -extern int get_compat_timespec(struct timespec *, - const struct compat_timespec __user *); -extern int put_compat_timespec(const struct timespec *, - struct compat_timespec __user *); -extern int get_compat_timeval(struct timeval *, - const struct 
compat_timeval __user *); -extern int put_compat_timeval(const struct timeval *, - struct compat_timeval __user *); /* * These functions operate on 32- or 64-bit specs depending on - * COMPAT_USE_64BIT_TIME, hence the void user pointer arguments and the - * naming as compat_get/put_ rather than get/put_compat_. + * COMPAT_USE_64BIT_TIME, hence the void user pointer arguments. */ extern int compat_get_timespec(struct timespec *, const void __user *); extern int compat_put_timespec(const struct timespec *, void __user *); extern int compat_get_timeval(struct timeval *, const void __user *); extern int compat_put_timeval(const struct timeval *, void __user *); +/* + * This function convert a timespec if necessary and returns a *user + * space* pointer. If no conversion is necessary, it returns the + * initial pointer. NULL is a legitimate argument and will always + * output NULL. + */ +extern int compat_convert_timespec(struct timespec __user **, + const void __user *); + struct compat_iovec { compat_uptr_t iov_base; compat_size_t iov_len; diff --git a/ipc/compat.c b/ipc/compat.c index f486b0096a67..1048522479e5 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -752,14 +752,8 @@ long compat_sys_shmctl(int first, int second, void __user *uptr) long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, unsigned nsops, const struct compat_timespec __user *timeout) { - struct timespec __user *ts64 = NULL; - if (timeout) { - struct timespec ts; - ts64 = compat_alloc_user_space(sizeof(*ts64)); - if (get_compat_timespec(&ts, timeout)) - return -EFAULT; - if (copy_to_user(ts64, &ts, sizeof(ts))) - return -EFAULT; - } + struct timespec __user *ts64; + if (compat_convert_timespec(&ts64, timeout)) + return -EFAULT; return sys_semtimedop(semid, tsems, nsops, ts64); } diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c index 63d7c6de335b..a9cf16378d7a 100644 --- a/ipc/compat_mq.c +++ b/ipc/compat_mq.c @@ -64,20 +64,6 @@ asmlinkage long compat_sys_mq_open(const char __user *u_name, return sys_mq_open(u_name, oflag, mode, p); } -static int compat_prepare_timeout(struct timespec __user **p, - const struct compat_timespec __user *u) -{ - struct timespec ts; - if (!u) { - *p = NULL; - return 0; - } - *p = compat_alloc_user_space(sizeof(ts)); - if (get_compat_timespec(&ts, u) || copy_to_user(*p, &ts, sizeof(ts))) - return -EFAULT; - return 0; -} - asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, size_t msg_len, unsigned int msg_prio, @@ -85,7 +71,7 @@ asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, { struct timespec __user *u_ts; - if (compat_prepare_timeout(&u_ts, u_abs_timeout)) + if (compat_convert_timespec(&u_ts, u_abs_timeout)) return -EFAULT; return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len, @@ -98,7 +84,8 @@ asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, const struct compat_timespec __user *u_abs_timeout) { struct timespec __user *u_ts; - if (compat_prepare_timeout(&u_ts, u_abs_timeout)) + + if (compat_convert_timespec(&u_ts, u_abs_timeout)) return -EFAULT; return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len, diff --git a/kernel/compat.c b/kernel/compat.c index 0a09e481b70b..3afc524a57ad 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -30,28 +30,6 @@ #include -/* - * Get/set struct timeval with struct timespec on the native side - */ -static int compat_get_timeval_convert(struct timespec *o, - struct compat_timeval __user *i) -{ - long usec; - - if (get_user(o->tv_sec, &i->tv_sec) || - get_user(usec, &i->tv_usec)) - return -EFAULT; - o->tv_nsec 
= usec * 1000; - return 0; -} - -static int compat_put_timeval_convert(struct compat_timeval __user *o, - struct timeval *i) -{ - return (put_user(i->tv_sec, &o->tv_sec) || - put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; -} - static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) { memset(txc, 0, sizeof(struct timex)); @@ -116,7 +94,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval_convert(tv, &ktv)) + if (compat_put_timeval(&ktv, tv)) return -EFAULT; } if (tz) { @@ -130,59 +108,58 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) { - struct timespec kts; - struct timezone ktz; + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; if (tv) { - if (compat_get_timeval_convert(&kts, tv)) + if (compat_get_timeval(&user_tv, tv)) return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) + if (copy_from_user(&new_tz, tz, sizeof(*tz))) return -EFAULT; } - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } -int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || __get_user(tv->tv_sec, &ctv->tv_sec) || __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(get_compat_timeval); -int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +static int __compat_put_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) { return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || __put_user(tv->tv_sec, &ctv->tv_sec) || __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(put_compat_timeval); -int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +static int __compat_get_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(get_compat_timespec); -int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +static int __compat_put_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || __put_user(ts->tv_sec, &cts->tv_sec) || __put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } -EXPORT_SYMBOL_GPL(put_compat_timespec); int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; else - return get_compat_timeval(tv, utv); + return __compat_get_timeval(tv, utv); } EXPORT_SYMBOL_GPL(compat_get_timeval); @@ -191,7 +168,7 @@ int compat_put_timeval(const struct timeval *tv, void __user *utv) if (COMPAT_USE_64BIT_TIME) return copy_to_user(utv, tv, sizeof *tv) ? 
-EFAULT : 0; else - return put_compat_timeval(tv, utv); + return __compat_put_timeval(tv, utv); } EXPORT_SYMBOL_GPL(compat_put_timeval); @@ -200,7 +177,7 @@ int compat_get_timespec(struct timespec *ts, const void __user *uts) if (COMPAT_USE_64BIT_TIME) return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; else - return get_compat_timespec(ts, uts); + return __compat_get_timespec(ts, uts); } EXPORT_SYMBOL_GPL(compat_get_timespec); @@ -209,10 +186,33 @@ int compat_put_timespec(const struct timespec *ts, void __user *uts) if (COMPAT_USE_64BIT_TIME) return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; else - return put_compat_timespec(ts, uts); + return __compat_put_timespec(ts, uts); } EXPORT_SYMBOL_GPL(compat_put_timespec); +int compat_convert_timespec(struct timespec __user **kts, + const void __user *cts) +{ + struct timespec ts; + struct timespec __user *uts; + + if (!cts || COMPAT_USE_64BIT_TIME) { + *kts = (struct timespec __user *)cts; + return 0; + } + + uts = compat_alloc_user_space(sizeof(ts)); + if (!uts) + return -EFAULT; + if (compat_get_timespec(&ts, cts)) + return -EFAULT; + if (copy_to_user(uts, &ts, sizeof(ts))) + return -EFAULT; + + *kts = uts; + return 0; +} + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; @@ -229,7 +229,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) if (ret) { rmtp = restart->nanosleep.compat_rmtp; - if (rmtp && put_compat_timespec(&rmt, rmtp)) + if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } @@ -243,7 +243,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, mm_segment_t oldfs; long ret; - if (get_compat_timespec(&tu, rqtp)) + if (compat_get_timespec(&tu, rqtp)) return -EFAULT; if (!timespec_valid(&tu)) @@ -263,7 +263,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, restart->fn = compat_nanosleep_restart; restart->nanosleep.compat_rmtp = rmtp; - if (rmtp && put_compat_timespec(&rmt, rmtp)) + if (rmtp && compat_put_timespec(&rmt, rmtp)) return -EFAULT; } @@ -647,8 +647,8 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, int get_compat_itimerspec(struct itimerspec *dst, const struct compat_itimerspec __user *src) { - if (get_compat_timespec(&dst->it_interval, &src->it_interval) || - get_compat_timespec(&dst->it_value, &src->it_value)) + if (__compat_get_timespec(&dst->it_interval, &src->it_interval) || + __compat_get_timespec(&dst->it_value, &src->it_value)) return -EFAULT; return 0; } @@ -656,8 +656,8 @@ int get_compat_itimerspec(struct itimerspec *dst, int put_compat_itimerspec(struct compat_itimerspec __user *dst, const struct itimerspec *src) { - if (put_compat_timespec(&src->it_interval, &dst->it_interval) || - put_compat_timespec(&src->it_value, &dst->it_value)) + if (__compat_put_timespec(&src->it_interval, &dst->it_interval) || + __compat_put_timespec(&src->it_value, &dst->it_value)) return -EFAULT; return 0; } @@ -727,7 +727,7 @@ long compat_sys_clock_settime(clockid_t which_clock, mm_segment_t oldfs; struct timespec ts; - if (get_compat_timespec(&ts, tp)) + if (compat_get_timespec(&ts, tp)) return -EFAULT; oldfs = get_fs(); set_fs(KERNEL_DS); @@ -749,7 +749,7 @@ long compat_sys_clock_gettime(clockid_t which_clock, err = sys_clock_gettime(which_clock, (struct timespec __user *) &ts); set_fs(oldfs); - if (!err && put_compat_timespec(&ts, tp)) + if (!err && compat_put_timespec(&ts, tp)) return -EFAULT; return err; } @@ -789,7 +789,7 @@ long 
compat_sys_clock_getres(clockid_t which_clock, err = sys_clock_getres(which_clock, (struct timespec __user *) &ts); set_fs(oldfs); - if (!err && tp && put_compat_timespec(&ts, tp)) + if (!err && tp && compat_put_timespec(&ts, tp)) return -EFAULT; return err; } @@ -808,7 +808,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) set_fs(oldfs); if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&tu, rmtp)) + compat_put_timespec(&tu, rmtp)) return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { @@ -827,7 +827,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, struct timespec in, out; struct restart_block *restart; - if (get_compat_timespec(&in, rqtp)) + if (compat_get_timespec(&in, rqtp)) return -EFAULT; oldfs = get_fs(); @@ -838,7 +838,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, set_fs(oldfs); if ((err == -ERESTART_RESTARTBLOCK) && rmtp && - put_compat_timespec(&out, rmtp)) + compat_put_timespec(&out, rmtp)) return -EFAULT; if (err == -ERESTART_RESTARTBLOCK) { @@ -1130,7 +1130,7 @@ COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, set_fs(KERNEL_DS); ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); set_fs(old_fs); - if (put_compat_timespec(&t, interval)) + if (compat_put_timespec(&t, interval)) return -EFAULT; return ret; } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index f9f44fd4d34d..55c8c9349cfe 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -183,7 +183,7 @@ COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI)) { - if (get_compat_timespec(&ts, utime)) + if (compat_get_timespec(&ts, utime)) return -EFAULT; if (!timespec_valid(&ts)) return -EINVAL; -- cgit v1.3-14-g43fede From dce44e03b0a3448ad11ac6c6e0cbe299e0400791 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 2 Feb 2014 17:57:28 -0800 Subject: compat: Fix sparse address space warnings In compat_sys_old_getrlimit() we pass a kernel pointer to sys_old_getrlimit() inside a set_fs() bracket. This is okay, so we can safely cast the affected pointer to __user. In compat_clock_nanosleep_restart(), the variable "rmtp" holds a user pointer. Annotate it as such. Both of these warnings are ancient, but were reported by Fengguang Wu's test system due to other changes. Signed-off-by: H. 
Peter Anvin Cc: Toyo Abe Link: http://lkml.kernel.org/n/tip-507h7cq5e45eg6ygtykon3bf@git.kernel.org --- kernel/compat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 3afc524a57ad..7076b57fa52e 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -451,7 +451,7 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - ret = sys_old_getrlimit(resource, &r); + ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r); set_fs(old_fs); if (!ret) { @@ -799,7 +799,7 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) long err; mm_segment_t oldfs; struct timespec tu; - struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp; + struct compat_timespec __user *rmtp = restart->nanosleep.compat_rmtp; restart->nanosleep.rmtp = (struct timespec __user *) &tu; oldfs = get_fs(); -- cgit v1.3-14-g43fede From 1ff6bbfd13ca2c114a5cb58e1a92d1e5d68ce0b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Jan 2014 18:10:37 -0500 Subject: arm, pm, vmpressure: add missing slab.h includes arch/arm/mach-tegra/pm.c, kernel/power/console.c and mm/vmpressure.c were somehow getting slab.h indirectly through cgroup.h which in turn was getting it indirectly through xattr.h. A scheduled cgroup change drops xattr.h inclusion from cgroup.h and breaks compilation of these three files. Add explicit slab.h includes to the three files. A pending cgroup patch depends on this change and it'd be great if this can be routed through cgroup/for-3.14-fixes branch. Signed-off-by: Tejun Heo Acked-by: Stephen Warren Cc: Thierry Reding Cc: linux-tegra@vger.kernel.org Cc: "Rafael J. Wysocki" Cc: linux-pm@vger.kernel.org Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: cgroups@vger.kernel.org --- arch/arm/mach-tegra/pm.c | 1 + kernel/power/console.c | 1 + mm/vmpressure.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/arch/arm/mach-tegra/pm.c b/arch/arm/mach-tegra/pm.c index 4ae0286b468d..f55b05a29b55 100644 --- a/arch/arm/mach-tegra/pm.c +++ b/arch/arm/mach-tegra/pm.c @@ -24,6 +24,7 @@ #include #include #include +#include <linux/slab.h> #include #include diff --git a/kernel/power/console.c b/kernel/power/console.c index eacb8bd8cab4..aba9c545a0e3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,6 +9,7 @@ #include #include #include +#include <linux/slab.h> #include "power.h" #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 196970a4541f..d4042e75f7c7 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -19,6 +19,7 @@ #include #include #include +#include <linux/slab.h> #include #include #include -- cgit v1.3-14-g43fede From 80d767d770fd9c697e434fd080c2db7b5c60c6dd Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 24 Jan 2014 16:41:36 -0500 Subject: time: Fix overflow when HZ is smaller than 60 When compiling for the IA-64 ski emulator, HZ is set to 32 because the emulation is slow and we don't want to waste too many cycles processing timers. Alpha also has an option to set HZ to 32. This causes integer overflow in kernel/time/jiffies.c: kernel/time/jiffies.c:66:2: warning: large integer implicitly truncated to unsigned type [-Woverflow] .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ ^ This patch reduces the JIFFIES_SHIFT value to avoid the overflow.
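The arithmetic can be sanity-checked in user space; a standalone sketch (illustrative only, approximating NSEC_PER_JIFFY as NSEC_PER_SEC / HZ):

#include <stdint.h>
#include <stdio.h>

/* Does (NSEC_PER_SEC / hz) << shift still fit in the 32-bit mult field? */
static int shift_fits(uint64_t hz, unsigned int shift)
{
	uint64_t nsec_per_jiffy = 1000000000ULL / hz;

	return (nsec_per_jiffy << shift) <= 0xffffffffULL;
}

int main(void)
{
	printf("HZ=32,  shift 8: %d\n", shift_fits(32, 8));	/* 0: 8e9 overflows 32 bits */
	printf("HZ=32,  shift 6: %d\n", shift_fits(32, 6));	/* 1: 2e9 fits */
	printf("HZ=48,  shift 7: %d\n", shift_fits(48, 7));	/* 1 */
	printf("HZ=100, shift 8: %d\n", shift_fits(100, 8));	/* 1: default unchanged */
	return 0;
}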
Signed-off-by: Mikulas Patocka Link: http://lkml.kernel.org/r/alpine.LRH.2.02.1401241639100.23871@file01.intranet.prod.int.rdu2.redhat.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/jiffies.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7a925ba456fb..a6a5bf53e86d 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -51,7 +51,13 @@ * HZ shrinks, so values greater than 8 overflow 32bits when * HZ=100. */ +#if HZ < 34 +#define JIFFIES_SHIFT 6 +#elif HZ < 67 +#define JIFFIES_SHIFT 7 +#else #define JIFFIES_SHIFT 8 +#endif static cycle_t jiffies_read(struct clocksource *cs) { -- cgit v1.3-14-g43fede From e8b175946c16d7001b22620f52d78ab497efc9d0 Mon Sep 17 00:00:00 2001 From: Shaibal Dutta Date: Fri, 31 Jan 2014 11:18:24 -0800 Subject: timekeeping: Move clock sync work to power efficient workqueue For better use of CPU idle time, allow the scheduler to select the CPU on which the CMOS clock sync work would be scheduled. This improves idle residency time and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Signed-off-by: Shaibal Dutta [zoran.markovic@linaro.org: Added commit message. Aligned code.] Signed-off-by: Zoran Markovic Cc: John Stultz Link: http://lkml.kernel.org/r/1391195904-12497-1-git-send-email-zoran.markovic@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index af8d1d4f3d55..419a52cecd20 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -514,12 +514,13 @@ static void sync_cmos_clock(struct work_struct *work) next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } - schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); + queue_delayed_work(system_power_efficient_wq, + &sync_cmos_work, timespec_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) { - schedule_delayed_work(&sync_cmos_work, 0); + queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } #else -- cgit v1.3-14-g43fede From 627ee7947e2e83ba565c31c5c9373d6e364b1ecd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 3 Feb 2014 14:34:31 -0800 Subject: clockevents: Serialize calls to clockevents_update_freq() in the core We can identify the broadcast device in the core and serialize all callers including interrupts on a different CPU against the update. Also, disabling interrupts is moved into the core, allowing callers to leave interrupts enabled when calling clockevents_update_freq().
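A sketch of the kind of caller this enables (hypothetical driver code; "my_timer_clkevt" and the notifier wiring are assumptions, not part of this patch):

/*
 * Hypothetical rate-change notifier for a timer whose input clock can
 * scale with cpufreq. With this patch the core disables interrupts and,
 * for the broadcast device, serializes against other CPUs internally,
 * so the callback needs no local_irq_save() of its own.
 */
static int my_timer_rate_change(struct notifier_block *nb,
				unsigned long event, void *data)
{
	struct clk_notifier_data *ndata = data;

	if (event == POST_RATE_CHANGE)
		clockevents_update_freq(&my_timer_clkevt, ndata->new_rate);

	return NOTIFY_DONE;
}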
Signed-off-by: Soren Brinkmann Cc: linux-arm-kernel@lists.infradead.org Cc: Soeren Brinkmann Cc: Daniel Lezcano Cc: Michal Simek Link: http://lkml.kernel.org/r/1391466877-28908-2-git-send-email-soren.brinkmann@xilinx.com Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 29 ++++++++++++++++++++++------- kernel/time/tick-broadcast.c | 25 +++++++++++++++++++------ kernel/time/tick-internal.h | 4 ++++ 3 files changed, 45 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 086ad6043bcb..641d91003a45 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -439,6 +439,16 @@ void clockevents_config_and_register(struct clock_event_device *dev, } EXPORT_SYMBOL_GPL(clockevents_config_and_register); +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) +{ + clockevents_config(dev, freq); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return 0; + + return clockevents_program_event(dev, dev->next_event, false); +} + /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify @@ -446,17 +456,22 @@ EXPORT_SYMBOL_GPL(clockevents_config_and_register); * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per - * cpu timer events with interrupts disabled! Returns 0 on success, - * -ETIME when the event is in the past. + * cpu timer events. If called for the broadcast device the core takes + * care of serialization. + * + * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { - clockevents_config(dev, freq); - - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return 0; + unsigned long flags; + int ret; - return clockevents_program_event(dev, dev->next_event, false); + local_irq_save(flags); + ret = tick_broadcast_update_freq(dev, freq); + if (ret == -ENODEV) + ret = __clockevents_update_freq(dev, freq); + local_irq_restore(flags); + return ret; } /* diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 43780ab5e279..003e6c3663b1 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -120,6 +120,19 @@ int tick_is_broadcast_device(struct clock_event_device *dev) return (dev && tick_broadcast_device.evtdev == dev); } +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) +{ + int ret = -ENODEV; + + if (tick_is_broadcast_device(dev)) { + raw_spin_lock(&tick_broadcast_lock); + ret = __clockevents_update_freq(dev, freq); + raw_spin_unlock(&tick_broadcast_lock); + } + return ret; +} + + static void err_broadcast(const struct cpumask *mask) { pr_crit_once("Failed to broadcast timer tick. Some CPUs may be unresponsive.\n"); @@ -272,12 +285,8 @@ static void tick_do_broadcast(struct cpumask *mask) */ static void tick_do_periodic_broadcast(void) { - raw_spin_lock(&tick_broadcast_lock); - cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); tick_do_broadcast(tmpmask); - - raw_spin_unlock(&tick_broadcast_lock); } /* @@ -287,13 +296,15 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { ktime_t next; + raw_spin_lock(&tick_broadcast_lock); + tick_do_periodic_broadcast(); /* * The device is in periodic mode. 
No reprogramming necessary: */ if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - return; + goto unlock; /* * Setup the next period for devices, which do not have @@ -306,9 +317,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, false)) - return; + goto unlock; tick_do_periodic_broadcast(); } +unlock: + raw_spin_unlock(&tick_broadcast_lock); } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 8329669b51ec..26f1c0ba9d81 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -111,6 +111,7 @@ extern int tick_resume_broadcast(void); extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); #else /* !BROADCAST */ @@ -133,6 +134,8 @@ static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } static inline void tick_broadcast_init(void) { } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, + u32 freq) { return -ENODEV; } /* * Set the periodic handler in non broadcast mode @@ -154,5 +157,6 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) #endif +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -- cgit v1.3-14-g43fede From fe79a9ba11962a603fb6af68fcb476e64031e46c Mon Sep 17 00:00:00 2001 From: Soren Brinkmann Date: Mon, 3 Feb 2014 14:34:32 -0800 Subject: clockevents: Adjust timer interval when frequency changes Clockevent devices in periodic mode are not updated when the frequency of the device changes. Issue a dev->set_mode() callback which forces the device to reevaluate the timer settings. Signed-off-by: Soren Brinkmann Cc: linux-arm-kernel@lists.infradead.org Cc: Daniel Lezcano Cc: Michal Simek Link: http://lkml.kernel.org/r/1391466877-28908-3-git-send-email-soren.brinkmann@xilinx.com Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 641d91003a45..f85e5fda9c66 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -443,10 +443,13 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) - return 0; + if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + return clockevents_program_event(dev, dev->next_event, false); + + if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); - return clockevents_program_event(dev, dev->next_event, false); + return 0; } /** -- cgit v1.3-14-g43fede From da7e6f45c34d39186b72328bacc4dd86bff60e0a Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Fri, 7 Feb 2014 13:36:06 +0530 Subject: time: Change the return type of clockevents_notify() to integer The broadcast framework can potentially be made use of by archs which do not have an external clock device as well. In that case, one of the CPUs needs to handle the broadcasting of wakeup IPIs to the CPUs in deep idle. As a result, its local timers should remain functional all the time.
For such a CPU, the BROADCAST_ENTER notification has to fail indicating that its clock device cannot be shutdown. To make way for this support, change the return type of tick_broadcast_oneshot_control() and hence clockevents_notify() to indicate such scenarios. Signed-off-by: Preeti U Murthy Cc: deepthi@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: fweisbec@gmail.com Cc: paulus@samba.org Cc: srivatsa.bhat@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: peterz@infradead.org Cc: benh@kernel.crashing.org Cc: rafael.j.wysocki@intel.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20140207080606.17187.78306.stgit@preeti.in.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 6 +++--- kernel/time/clockevents.c | 8 +++++--- kernel/time/tick-broadcast.c | 6 ++++-- kernel/time/tick-internal.h | 6 +++--- 4 files changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 493aa021c7a9..e0c5a6c418de 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -186,9 +186,9 @@ static inline int tick_check_broadcast_expired(void) { return 0; } #endif #ifdef CONFIG_GENERIC_CLOCKEVENTS -extern void clockevents_notify(unsigned long reason, void *arg); +extern int clockevents_notify(unsigned long reason, void *arg); #else -static inline void clockevents_notify(unsigned long reason, void *arg) {} +static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } #endif #else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */ @@ -196,7 +196,7 @@ static inline void clockevents_notify(unsigned long reason, void *arg) {} static inline void clockevents_suspend(void) {} static inline void clockevents_resume(void) {} -static inline void clockevents_notify(unsigned long reason, void *arg) {} +static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } static inline int tick_check_broadcast_expired(void) { return 0; } #endif diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f85e5fda9c66..ad362c260ef4 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -542,12 +542,13 @@ void clockevents_resume(void) #ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events + * Returns 0 on success, any other value on error */ -void clockevents_notify(unsigned long reason, void *arg) +int clockevents_notify(unsigned long reason, void *arg) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu; + int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); @@ -560,7 +561,7 @@ void clockevents_notify(unsigned long reason, void *arg) case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_oneshot_control(reason); + ret = tick_broadcast_oneshot_control(reason); break; case CLOCK_EVT_NOTIFY_CPU_DYING: @@ -603,6 +604,7 @@ void clockevents_notify(unsigned long reason, void *arg) break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); + return ret; } EXPORT_SYMBOL_GPL(clockevents_notify); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 003e6c3663b1..84c8fd91d744 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -646,14 +646,15 @@ again: /* * Powerstate information: The system enters/leaves a state, where * affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. 
*/ -void tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(unsigned long reason) { struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags; ktime_t now; - int cpu; + int cpu, ret = 0; /* * Periodic mode does not care about the enter/exit of power @@ -759,6 +760,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) } out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + return ret; } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 26f1c0ba9d81..0756c62c219a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -46,7 +46,7 @@ extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern void tick_broadcast_oneshot_control(unsigned long reason); +extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); @@ -58,7 +58,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } @@ -87,7 +87,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { -- cgit v1.3-14-g43fede From 5d1638acb9f62fa7eb0c07cb85318bbe1f13b227 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Fri, 7 Feb 2014 13:36:32 +0530 Subject: tick: Introduce hrtimer based broadcast On some architectures, in certain CPU deep idle states the local timers stop. An external clock device is used to wakeup these CPUs. The kernel support for the wakeup of these CPUs is provided by the tick broadcast framework by using the external clock device as the wakeup source. However not all implementations of architectures provide such an external clock device. This patch includes support in the broadcast framework to handle the wakeup of the CPUs in deep idle states on such systems by queuing a hrtimer on one of the CPUs, which is meant to handle the wakeup of CPUs in deep idle states. This patchset introduces a pseudo clock device which can be registered by the archs as tick_broadcast_device in the absence of a real external clock device. Once registered, the broadcast framework will work as is for these architectures as long as the archs take care of the BROADCAST_ENTER notification failing for one of the CPUs. This CPU is made the stand by CPU to handle wakeup of the CPUs in deep idle and it *must not enter deep idle states*. The CPU with the earliest wakeup is chosen to be this CPU. 
Hence this way the stand by CPU dynamically moves around and so does the hrtimer which is queued to trigger at the next earliest wakeup time. This is consistent with the case where an external clock device is present. The smp affinity of this clock device is set to the CPU with the earliest wakeup. This patchset handles the hotplug of the stand by CPU as well by moving the hrtimer on to the CPU handling the CPU_DEAD notification. Originally-from: Thomas Gleixner Signed-off-by: Preeti U Murthy Cc: deepthi@linux.vnet.ibm.com Cc: paulmck@linux.vnet.ibm.com Cc: fweisbec@gmail.com Cc: paulus@samba.org Cc: srivatsa.bhat@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: peterz@infradead.org Cc: benh@kernel.crashing.org Cc: rafael.j.wysocki@intel.com Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20140207080632.17187.80532.stgit@preeti.in.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 9 +++ kernel/time/Makefile | 2 +- kernel/time/tick-broadcast-hrtimer.c | 106 +++++++++++++++++++++++++++++++++++ kernel/time/tick-broadcast.c | 54 +++++++++++++++++- 4 files changed, 167 insertions(+), 4 deletions(-) create mode 100644 kernel/time/tick-broadcast-hrtimer.c (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index e0c5a6c418de..dbe9e1457168 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -62,6 +62,11 @@ enum clock_event_mode { #define CLOCK_EVT_FEAT_DYNIRQ 0x000020 #define CLOCK_EVT_FEAT_PERCPU 0x000040 +/* + * Clockevent device is based on a hrtimer for broadcast + */ +#define CLOCK_EVT_FEAT_HRTIMER 0x000080 + /** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low @@ -83,6 +88,7 @@ enum clock_event_mode { * @name: ptr to clock event name * @rating: variable to rate clock event devices * @irq: IRQ number (only for non CPU local devices) + * @bound_on: Bound on CPU * @cpumask: cpumask to indicate for which CPUs this device works * @list: list head for the management code * @owner: module reference @@ -113,6 +119,7 @@ struct clock_event_device { const char *name; int rating; int irq; + int bound_on; const struct cpumask *cpumask; struct list_head list; struct module *owner; @@ -180,9 +187,11 @@ extern int tick_receive_broadcast(void); #endif #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); #else static inline int tick_check_broadcast_expired(void) { return 0; } +static void tick_setup_hrtimer_broadcast(void) {}; #endif #ifdef CONFIG_GENERIC_CLOCKEVENTS diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 9250130646f5..06151ef4a744 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -3,7 +3,7 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o tick-broadcast-hrtimer.o obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c new file mode 100644 index 000000000000..92425279312b --- /dev/null +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -0,0 +1,106 @@ +/* + * 
linux/kernel/time/tick-broadcast-hrtimer.c + * This file emulates a local clock event device + * via a pseudo clock device. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tick-internal.h" + +static struct hrtimer bctimer; + +static void bc_set_mode(enum clock_event_mode mode, + struct clock_event_device *bc) +{ + switch (mode) { + case CLOCK_EVT_MODE_SHUTDOWN: + /* + * Note, we cannot cancel the timer here as we might + * run into the following live lock scenario: + * + * cpu 0 cpu1 + * lock(broadcast_lock); + * hrtimer_interrupt() + * bc_handler() + * tick_handle_oneshot_broadcast(); + * lock(broadcast_lock); + * hrtimer_cancel() + * wait_for_callback() + */ + hrtimer_try_to_cancel(&bctimer); + break; + default: + break; + } +} + +/* + * This is called from the guts of the broadcast code when the cpu + * which is about to enter idle has the earliest broadcast timer event. + */ +static int bc_set_next(ktime_t expires, struct clock_event_device *bc) +{ + /* + * We try to cancel the timer first. If the callback is on + * flight on some other cpu then we let it handle it. If we + * were able to cancel the timer nothing can rearm it as we + * own broadcast_lock. + * + * However we can also be called from the event handler of + * ce_broadcast_hrtimer itself when it expires. We cannot + * restart the timer because we are in the callback, but we + * can set the expiry time and let the callback return + * HRTIMER_RESTART. + */ + if (hrtimer_try_to_cancel(&bctimer) >= 0) { + hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + /* Bind the "device" to the cpu */ + bc->bound_on = smp_processor_id(); + } else if (bc->bound_on == smp_processor_id()) { + hrtimer_set_expires(&bctimer, expires); + } + return 0; +} + +static struct clock_event_device ce_broadcast_hrtimer = { + .set_mode = bc_set_mode, + .set_next_ktime = bc_set_next, + .features = CLOCK_EVT_FEAT_ONESHOT | + CLOCK_EVT_FEAT_KTIME | + CLOCK_EVT_FEAT_HRTIMER, + .rating = 0, + .bound_on = -1, + .min_delta_ns = 1, + .max_delta_ns = KTIME_MAX, + .min_delta_ticks = 1, + .max_delta_ticks = KTIME_MAX, + .mult = 1, + .shift = 0, + .cpumask = cpu_all_mask, +}; + +static enum hrtimer_restart bc_handler(struct hrtimer *t) +{ + ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); + + if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + return HRTIMER_NORESTART; + + return HRTIMER_RESTART; +} + +void tick_setup_hrtimer_broadcast(void) +{ + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + bctimer.function = bc_handler; + clockevents_register_device(&ce_broadcast_hrtimer); +} diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 84c8fd91d744..63c7b2d9ed8e 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -643,6 +643,42 @@ again: raw_spin_unlock(&tick_broadcast_lock); } +static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) +{ + if (!(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return 0; + if (bc->next_event.tv64 == KTIME_MAX) + return 0; + return bc->bound_on == cpu ? -EBUSY : 0; +} + +static void broadcast_shutdown_local(struct clock_event_device *bc, + struct clock_event_device *dev) +{ + /* + * For hrtimer based broadcasting we cannot shutdown the cpu + * local device if our own event is the first one to expire or + * if we own the broadcast timer. 
+ */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) { + if (broadcast_needs_cpu(bc, smp_processor_id())) + return; + if (dev->next_event.tv64 < bc->next_event.tv64) + return; + } + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); +} + +static void broadcast_move_bc(int deadcpu) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || !broadcast_needs_cpu(bc, deadcpu)) + return; + /* This moves the broadcast assignment to this cpu */ + clockevents_program_event(bc, bc->next_event, 1); +} + /* * Powerstate information: The system enters/leaves a state, where * affected devices might stop @@ -661,7 +697,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return; + return 0; /* * We are called with preemtion disabled from the depth of the @@ -672,7 +708,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return; + return 0; bc = tick_broadcast_device.evtdev; @@ -680,7 +716,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + broadcast_shutdown_local(bc, dev); /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and @@ -693,6 +729,16 @@ int tick_broadcast_oneshot_control(unsigned long reason) dev->next_event.tv64 < bc->next_event.tv64) tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we remove the + * CPU from the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); @@ -866,6 +912,8 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); + broadcast_move_bc(cpu); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.3-14-g43fede From f1689bb7abec8e2e670d8ad11eaa86d54bad8cfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 16:00:46 +0100 Subject: time: Fixup fallout from recent clockevent/tick changes Make the stub function static inline instead of static and move the clockevents related function into the proper ifdeffed section. 
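The reason the stub needs "static inline" is worth spelling out with a reduced example (illustrative only, not the reported warning verbatim):

/* A plain 'static' definition in a header emits a private copy in every
 * translation unit that includes it, and -Wunused-function fires in each
 * one that never calls it: */
static void tick_setup_hrtimer_broadcast(void) { }

/* 'static inline' is the idiomatic header stub: it stays silent and
 * generates no code when unused: */
static inline void tick_setup_hrtimer_broadcast(void) { }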
Reported-by: Fengguang Wu Signed-off-by: Thomas Gleixner Cc: Soren Brinkmann Cc: Preeti U Murthy --- include/linux/clockchips.h | 2 +- kernel/time/tick-internal.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index dbe9e1457168..20a7183f2831 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -191,7 +191,7 @@ extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); #else static inline int tick_check_broadcast_expired(void) { return 0; } -static void tick_setup_hrtimer_broadcast(void) {}; +static inline void tick_setup_hrtimer_broadcast(void) {}; #endif #ifdef CONFIG_GENERIC_CLOCKEVENTS diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 0756c62c219a..7ab92b19965a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -155,8 +155,9 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); + #endif -int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -- cgit v1.3-14-g43fede From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 6 Feb 2014 15:56:01 -0800 Subject: cgroup: use an ordered workqueue for cgroup destruction Sometimes the cleanup after memcg hierarchy testing gets stuck in mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0. There may turn out to be several causes, but a major cause is this: the workitem to offline parent can get run before workitem to offline child; parent's mem_cgroup_reparent_charges() circles around waiting for the child's pages to be reparented to its lrus, but it's holding cgroup_mutex which prevents the child from reaching its mem_cgroup_reparent_charges(). Just use an ordered workqueue for cgroup_destroy_wq. tj: Committing as the temporary fix until the reverse dependency can be removed from memcg. Comment updated accordingly. Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction") Suggested-by: Filipe Brandenburger Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2f46ba37f72..aa95591c1430 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void) /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. - * Use 1 for @max_active. + * + * XXX: Must be ordered to make sure parent is offlined after + * children. The ordering requirement is for memcg where a + * parent's offline may wait for a child's leading to deadlock. In + * the long term, this should be fixed from memcg side. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. 
*/ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); BUG_ON(!cgroup_destroy_wq); /* -- cgit v1.3-14-g43fede From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:33 -0500 Subject: cgroup: fix error return value in cgroup_mount() When cgroup_mount() fails to allocate an id for the root, it didn't set ret before jumping to unlock_drop ending up returning 0 after a failure. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa95591c1430..793f37176077 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); - root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, - 0, 1, GFP_KERNEL); - if (root_cgrp->id < 0) + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) goto unlock_drop; + root_cgrp->id = ret; /* Check for name clashes with existing mounts */ ret = -EBUSY; -- cgit v1.3-14-g43fede From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:33 -0500 Subject: cgroup: fix error return from cgroup_create() cgroup_create() was returning 0 after allocation failures. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 793f37176077..0eb7b868e1ab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; - int ssid, err = 0; + int ssid, err; struct cgroup_subsys *ss; struct super_block *sb = root->sb; @@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return -ENOMEM; name = cgroup_alloc_name(dentry); - if (!name) + if (!name) { + err = -ENOMEM; goto err_free_cgrp; + } rcu_assign_pointer(cgrp->name, name); /* @@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * a half-baked cgroup. */ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) + if (cgrp->id < 0) { + err = -ENOMEM; goto err_free_name; + } /* * Only live parents can have children. Note that the liveliness -- cgit v1.3-14-g43fede From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:34 -0500 Subject: cgroup: fix locking in cgroup_cfts_commit() cgroup_cfts_commit() walks the cgroup hierarchy that the target subsystem is attached to and tries to apply the file changes. Due to the convolution with inode locking, it can't keep cgroup_mutex locked while iterating. It currently holds only RCU read lock around the actual iteration and then pins the found cgroup using dget(). Unfortunately, this is incorrect. Although the iteration does check cgroup_is_dead() before invoking dget(), there's nothing which prevents the dentry from going away inbetween. 
Note that this is different from the usual css iterations where css_tryget() is used to pin the css - css_tryget() tests whether the css can be pinned and fails if not. The problem can be solved by simply holding cgroup_mutex instead of RCU read lock around the iteration, which actually reduces LOC. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0eb7b868e1ab..3edf7163b84f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) */ update_before = cgroup_serial_nr_next; - mutex_unlock(&cgroup_mutex); - /* add/rm files for all cgroups created before */ - rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; @@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) inode = cgrp->dentry->d_inode; dget(cgrp->dentry); - rcu_read_unlock(); - dput(prev); prev = cgrp->dentry; + mutex_unlock(&cgroup_mutex); mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - - rcu_read_lock(); if (ret) break; } - rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); dput(prev); deactivate_super(sb); return ret; -- cgit v1.3-14-g43fede From 3ed80a62bf959d34ebd4d553b026fbe7e6fbcc54 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: drop module support With module supported dropped from net_prio, no controller is using cgroup module support. None of actual resource controllers can be built as a module and we aren't gonna add new controllers which don't control resources. This patch drops module support from cgroup. * cgroup_[un]load_subsys() and cgroup_subsys->module removed. * As there's no point in distinguishing IS_BUILTIN() and IS_MODULE(), cgroup_subsys.h now uses IS_ENABLED() directly. * enum cgroup_subsys_id now exactly matches the list of enabled controllers as ordered in cgroup_subsys.h. * cgroup_subsys[] is now a contiguously occupied array. Size specification is no longer necessary and dropped. * for_each_builtin_subsys() is removed and for_each_subsys() is updated to not require any locking. * module ref handling is removed from rebind_subsystems(). * Module related comments dropped. v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). v3: Added {} around the if (need_forkexit_callback) block in cgroup_post_fork() for readability as suggested by Li. 
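With the subsystem array contiguous and immutable after boot, the reworked iterator needs no locking; a minimal sketch of a walker (a hypothetical debug snippet, not part of the patch):

struct cgroup_subsys *ss;
int ssid;

/* cgroup_subsys[] has no NULL holes and never changes after init,
 * so this walk is safe without cgroup_mutex. */
for_each_subsys(ss, ssid)
	pr_info("cgroup subsys %d: %s\n", ssid, ss->name);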
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-cgroup.c | 1 - include/linux/cgroup.h | 29 +---- include/linux/cgroup_subsys.h | 24 ++-- kernel/cgroup.c | 284 +++--------------------------------------- net/core/netclassid_cgroup.c | 1 - net/core/netprio_cgroup.c | 1 - 6 files changed, 32 insertions(+), 308 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4e491d9b5292..660d419918a7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -914,7 +914,6 @@ struct cgroup_subsys blkio_subsys = { .can_attach = blkcg_can_attach, .subsys_id = blkio_subsys_id, .base_cftypes = blkcg_files, - .module = THIS_MODULE, }; EXPORT_SYMBOL_GPL(blkio_subsys); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5c097596104b..d842a737d448 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -37,28 +37,13 @@ extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); -extern int cgroup_load_subsys(struct cgroup_subsys *ss); -extern void cgroup_unload_subsys(struct cgroup_subsys *ss); extern int proc_cgroup_show(struct seq_file *, void *); -/* - * Define the enumeration of all cgroup subsystems. - * - * We define ids for builtin subsystems and then modular ones. - */ +/* define the enumeration of all cgroup subsystems */ #define SUBSYS(_x) _x ## _subsys_id, enum cgroup_subsys_id { -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) #include -#undef IS_SUBSYS_ENABLED - CGROUP_BUILTIN_SUBSYS_COUNT, - - __CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1, - -#define IS_SUBSYS_ENABLED(option) IS_MODULE(option) -#include -#undef IS_SUBSYS_ENABLED CGROUP_SUBSYS_COUNT, }; #undef SUBSYS @@ -370,10 +355,9 @@ struct css_set { struct list_head cgrp_links; /* - * Set of subsystem states, one for each subsystem. This array - * is immutable after creation apart from the init_css_set - * during subsystem registration (at boot time) and modular subsystem - * loading/unloading. + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; @@ -620,15 +604,10 @@ struct cgroup_subsys { /* base cftypes, automatically [de]registered with subsys itself */ struct cftype *base_cftypes; struct cftype_set base_cftset; - - /* should be defined only by modular subsystems */ - struct module *module; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) #include -#undef IS_SUBSYS_ENABLED #undef SUBSYS /** diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 7b99d717411d..11c42f6a25a8 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -3,51 +3,51 @@ * * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. 
*/ -#if IS_SUBSYS_ENABLED(CONFIG_CPUSETS) +#if IS_ENABLED(CONFIG_CPUSETS) SUBSYS(cpuset) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEBUG) +#if IS_ENABLED(CONFIG_CGROUP_DEBUG) SUBSYS(debug) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_SCHED) +#if IS_ENABLED(CONFIG_CGROUP_SCHED) SUBSYS(cpu_cgroup) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_CPUACCT) +#if IS_ENABLED(CONFIG_CGROUP_CPUACCT) SUBSYS(cpuacct) #endif -#if IS_SUBSYS_ENABLED(CONFIG_MEMCG) +#if IS_ENABLED(CONFIG_MEMCG) SUBSYS(mem_cgroup) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE) +#if IS_ENABLED(CONFIG_CGROUP_DEVICE) SUBSYS(devices) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FREEZER) +#if IS_ENABLED(CONFIG_CGROUP_FREEZER) SUBSYS(freezer) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_CLASSID) +#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) SUBSYS(net_cls) #endif -#if IS_SUBSYS_ENABLED(CONFIG_BLK_CGROUP) +#if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(blkio) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF) +#if IS_ENABLED(CONFIG_CGROUP_PERF) SUBSYS(perf) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) SUBSYS(net_prio) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB) +#if IS_ENABLED(CONFIG_CGROUP_HUGETLB) SUBSYS(hugetlb) #endif /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2f46ba37f72..ccb16b47e293 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include #include @@ -120,15 +119,9 @@ static struct workqueue_struct *cgroup_destroy_wq; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated with the built in subsystems, and modular subsystems are - * registered after that. The mutable section of this array is protected by - * cgroup_mutex. - */ +/* generate an array of cgroup subsystem pointers */ #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { +static struct cgroup_subsys *cgroup_subsys[] = { #include }; @@ -258,30 +251,13 @@ static int notify_on_release(const struct cgroup *cgrp) else /** - * for_each_subsys - iterate all loaded cgroup subsystems + * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * - * Iterates through all loaded subsystems. Should be called under - * cgroup_mutex or cgroup_root_mutex. */ #define for_each_subsys(ss, ssid) \ - for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ - (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((ss) = cgroup_subsys[(ssid)])) { } \ - else - -/** - * for_each_builtin_subsys - iterate all built-in cgroup subsystems - * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end - * - * Bulit-in subsystems are always present and iteration itself doesn't - * require any synchronization. - */ -#define for_each_builtin_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[i]) || true); (i)++) + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /* iterate across the active hierarchies */ #define for_each_active_root(root) \ @@ -975,50 +951,24 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } -/* - * Call with cgroup_mutex held. 
Drops reference counts on modules, including - * any duplicate ones that parse_cgroupfs_options took. If this function - * returns an error, no reference counts are touched. - */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - unsigned long pinned = 0; int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) { - if (!(added_mask & (1 << i))) - continue; - - /* is the subsystem mounted elsewhere? */ - if (ss->root != &cgroup_dummy_root) { - ret = -EBUSY; - goto out_put; - } - - /* pin the module */ - if (!try_module_get(ss->module)) { - ret = -ENOENT; - goto out_put; - } - pinned |= 1 << i; - } - - /* subsys could be missing if unloaded between parsing and here */ - if (added_mask != pinned) { - ret = -ENOENT; - goto out_put; - } + for_each_subsys(ss, i) + if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root) + return -EBUSY; ret = cgroup_populate_dir(cgrp, added_mask); if (ret) - goto out_put; + return ret; /* * Nothing can fail from this point on. Remove files for the @@ -1057,9 +1007,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - - /* subsystem is now free - drop reference on module */ - module_put(ss->module); root->subsys_mask &= ~bit; } } @@ -1071,12 +1018,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->flags |= CGRP_ROOT_SUBSYS_BOUND; return 0; - -out_put: - for_each_subsys(ss, i) - if (pinned & (1 << i)) - module_put(ss->module); - return ret; } static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) @@ -4506,7 +4447,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) return ret; } -static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) +static void __init cgroup_init_cftsets(struct cgroup_subsys *ss) { INIT_LIST_HEAD(&ss->cftsets); @@ -4559,185 +4500,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); - - /* this function shouldn't be used with modular subsystems, since they - * need to register a subsys_id, among other things */ - BUG_ON(ss->module); } -/** - * cgroup_load_subsys: load and register a modular subsystem at runtime - * @ss: the subsystem to load - * - * This function should be called in a modular subsystem's initcall. If the - * subsystem is built as a module, it will be assigned a new subsys_id and set - * up for use. If the subsystem is built-in anyway, work is delegated to the - * simpler cgroup_init_subsys. - */ -int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - int i, ret; - struct hlist_node *tmp; - struct css_set *cset; - unsigned long key; - - /* check name and function validity */ - if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->css_alloc == NULL || ss->css_free == NULL) - return -EINVAL; - - /* - * we don't support callbacks in modular subsystems. this check is - * before the ss->module check for consistency; a subsystem that could - * be a module should still have no callbacks even if the user isn't - * compiling it as one. 
- */ - if (ss->fork || ss->exit) - return -EINVAL; - - /* - * an optionally modular subsystem is built-in: we want to do nothing, - * since cgroup_init_subsys will have already taken care of it. - */ - if (ss->module == NULL) { - /* a sanity check */ - BUG_ON(cgroup_subsys[ss->subsys_id] != ss); - return 0; - } - - /* init base cftset */ - cgroup_init_cftsets(ss); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - cgroup_subsys[ss->subsys_id] = ss; - - /* - * no ss->css_alloc seems to need anything important in the ss - * struct, so this can happen first (i.e. before the dummy root - * attachment). - */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); - if (IS_ERR(css)) { - /* failure case - need to deassign the cgroup_subsys[] slot. */ - cgroup_subsys[ss->subsys_id] = NULL; - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return PTR_ERR(css); - } - - ss->root = &cgroup_dummy_root; - - /* our new subsystem will be attached to the dummy hierarchy. */ - init_css(css, ss, cgroup_dummy_top); - - /* - * Now we need to entangle the css into the existing css_sets. unlike - * in cgroup_init_subsys, there are now multiple css_sets, so each one - * will need a new pointer to it; done by iterating the css_set_table. - * furthermore, modifying the existing css_sets will corrupt the hash - * table state, so each changed css_set will need its hash recomputed. - * this is all done under the css_set_lock. - */ - write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { - /* skip entries that we already rehashed */ - if (cset->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hash_del(&cset->hlist); - /* set new value */ - cset->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - ret = online_css(css); - if (ret) { - ss->css_free(css); - goto err_unload; - } - - /* success! */ - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return 0; - -err_unload: - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - /* @ss can't be mounted here as try_module_get() would fail */ - cgroup_unload_subsys(ss); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_load_subsys); - -/** - * cgroup_unload_subsys: unload a modular subsystem - * @ss: the subsystem to unload - * - * This function should be called in a modular subsystem's exitcall. When this - * function is invoked, the refcount on the subsystem's module will be 0, so - * the subsystem will not be attached to any hierarchy. - */ -void cgroup_unload_subsys(struct cgroup_subsys *ss) -{ - struct cgrp_cset_link *link; - struct cgroup_subsys_state *css; - - BUG_ON(ss->module == NULL); - - /* - * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get() in rebind_subsystems() should ensure that it - * doesn't start being used while we're killing it off. - */ - BUG_ON(ss->root != &cgroup_dummy_root); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - css = cgroup_css(cgroup_dummy_top, ss); - if (css) - offline_css(css); - - /* deassign the subsys_id */ - cgroup_subsys[ss->subsys_id] = NULL; - - /* - * disentangle the css from all css_sets attached to the dummy - * top. as in loading, we need to pay our respects to the hashtable - * gods. 
- */ - write_lock(&css_set_lock); - list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { - struct css_set *cset = link->cset; - unsigned long key; - - hash_del(&cset->hlist); - cset->subsys[ss->subsys_id] = NULL; - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - /* - * remove subsystem's css from the cgroup_dummy_top and free it - - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. - */ - if (css) - ss->css_free(css); - RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); -} -EXPORT_SYMBOL_GPL(cgroup_unload_subsys); - /** * cgroup_init_early - cgroup initialization at system boot * @@ -4763,8 +4527,7 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - /* at bootup time, we don't worry about modular subsystems */ - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); BUG_ON(!ss->css_alloc); @@ -4797,7 +4560,7 @@ int __init cgroup_init(void) if (err) return err; - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); } @@ -5032,15 +4795,7 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, and the builtin section of the subsys - * array is immutable, so we don't need to lock the - * subsys array here. On the other hand, modular section - * of the array can be freed at module unload, so we - * can't touch that. - */ - for_each_builtin_subsys(ss, i) + for_each_subsys(ss, i) if (ss->fork) ss->fork(child); } @@ -5105,11 +4860,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) RCU_INIT_POINTER(tsk->cgroups, &init_css_set); if (run_callbacks && need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, see cgroup_post_fork() for details. - */ - for_each_builtin_subsys(ss, i) { + /* see cgroup_post_fork() for details */ + for_each_subsys(ss, i) { if (ss->exit) { struct cgroup_subsys_state *old_css = cset->subsys[i]; struct cgroup_subsys_state *css = task_css(tsk, i); @@ -5228,11 +4980,7 @@ static int __init cgroup_disable(char *str) if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about - * module subsystems, so we don't worry about them. 
- */ - for_each_subsys(ss, i) { if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 9fc7f90d034c..9e5ad5d74e60 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -110,5 +110,4 @@ struct cgroup_subsys net_cls_subsys = { .attach = cgrp_attach, .subsys_id = net_cls_subsys_id, .base_cftypes = ss_files, - .module = THIS_MODULE, }; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cc3a31e7dc08..857e1603f9b7 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -252,7 +252,6 @@ struct cgroup_subsys net_prio_subsys = { .attach = net_prio_attach, .subsys_id = net_prio_subsys_id, .base_cftypes = ss_files, - .module = THIS_MODULE, }; static int netprio_device_event(struct notifier_block *unused, -- cgit v1.3-14-g43fede From 073219e995b4a3f8cf1ce8228b7ef440b6994ac0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: clean up cgroup_subsys names and initialization cgroup_subsys is a bit messier than it needs to be. * The name of a subsys can be different from its internal identifier defined in cgroup_subsys.h. Most subsystems use the matching name but three - cpu, memory and perf_event - use different ones. * cgroup_subsys_id enums are postfixed with _subsys_id and each cgroup_subsys is postfixed with _subsys. cgroup.h is widely included throughout various subsystems; it doesn't and shouldn't have a claim on such generic names which don't have any qualifier indicating that they belong to cgroup. * cgroup_subsys->subsys_id should always equal the matching cgroup_subsys_id enum; however, we require each controller to initialize it and then BUG if they don't match, which is a bit silly. This patch cleans up cgroup_subsys names and initialization by doing the following: * cgroup_subsys_id enums are now postfixed with _cgrp_id, and each cgroup_subsys with _cgrp_subsys. * With the above, renaming subsys identifiers to match the userland visible names doesn't cause any naming conflicts. All non-matching identifiers are renamed to match the official names. cpu_cgroup -> cpu mem_cgroup -> memory perf -> perf_event * controllers no longer need to initialize ->subsys_id and ->name. They're generated in cgroup core and set automatically during boot. * Redundant cgroup_subsys declarations removed. * While updating BUG_ON()s in cgroup_init_early(), convert them to WARN()s. BUGging that early during boot is stupid - the kernel can't print anything, even through the serial console, and the trap handler doesn't even link the stack frame properly for back-tracing. This patch doesn't introduce any behavior changes. v2: Rebased on top of fe1217c4f3f7 ("net: net_cls: move cgroupfs classid handling into core"). Signed-off-by: Tejun Heo Acked-by: Neil Horman Acked-by: "David S. Miller" Acked-by: "Rafael J. Wysocki" Acked-by: Michal Hocko Acked-by: Peter Zijlstra Acked-by: Aristeu Rozanski Acked-by: Ingo Molnar Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Serge E.
Hallyn Cc: Vivek Goyal Cc: Thomas Graf --- block/blk-cgroup.c | 8 +++----- block/blk-cgroup.h | 2 +- fs/bio.c | 2 +- include/linux/cgroup.h | 7 ++++--- include/linux/cgroup_subsys.h | 6 +++--- include/linux/hugetlb_cgroup.h | 2 +- include/linux/memcontrol.h | 2 +- include/net/cls_cgroup.h | 2 +- include/net/netprio_cgroup.h | 2 +- kernel/cgroup.c | 34 ++++++++++++++++++++-------------- kernel/cgroup_freezer.c | 8 ++------ kernel/cpuset.c | 10 ++++------ kernel/events/core.c | 8 +++----- kernel/sched/core.c | 6 ++---- kernel/sched/cpuacct.c | 6 ++---- mm/hugetlb_cgroup.c | 9 +++------ mm/memcontrol.c | 22 ++++++++++------------ net/core/netclassid_cgroup.c | 6 ++---- net/core/netprio_cgroup.c | 4 +--- net/ipv4/tcp_memcontrol.c | 2 +- security/device_cgroup.c | 8 ++------ 21 files changed, 68 insertions(+), 88 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 660d419918a7..1cef07cf9c21 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -906,16 +906,14 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", +struct cgroup_subsys blkio_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .subsys_id = blkio_subsys_id, .base_cftypes = blkcg_files, }; -EXPORT_SYMBOL_GPL(blkio_subsys); +EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1105,7 +1103,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) /* everything is in place, add intf files for the new policy */ if (pol->cftypes) - WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); + WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes)); ret = 0; out_unlock: mutex_unlock(&blkcg_pol_mutex); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 86154eab9523..453b528c8e19 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -186,7 +186,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_subsys_id)); + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) diff --git a/fs/bio.c b/fs/bio.c index 75c49a382239..4872102b839e 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1965,7 +1965,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_css(current, blkio_subsys_id); + css = task_css(current, blkio_cgrp_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d842a737d448..cd6611e622fd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -41,7 +41,7 @@ extern int cgroupstats_build(struct cgroupstats *stats, extern int proc_cgroup_show(struct seq_file *, void *); /* define the enumeration of all cgroup subsystems */ -#define SUBSYS(_x) _x ## _subsys_id, +#define SUBSYS(_x) _x ## _cgrp_id, enum cgroup_subsys_id { #include CGROUP_SUBSYS_COUNT, @@ -573,7 +573,6 @@ struct cgroup_subsys { struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); - int subsys_id; int disabled; int early_init; @@ -592,6 +591,8 @@ struct cgroup_subsys { bool broken_hierarchy; bool warned_broken_hierarchy; + /* the following two fields are initialized automatically during boot */ + int subsys_id; #define
MAX_CGROUP_TYPE_NAMELEN 32 const char *name; @@ -606,7 +607,7 @@ struct cgroup_subsys { struct cftype_set base_cftset; }; -#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; +#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include #undef SUBSYS diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 11c42f6a25a8..768fe44e19f0 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -12,7 +12,7 @@ SUBSYS(debug) #endif #if IS_ENABLED(CONFIG_CGROUP_SCHED) -SUBSYS(cpu_cgroup) +SUBSYS(cpu) #endif #if IS_ENABLED(CONFIG_CGROUP_CPUACCT) @@ -20,7 +20,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_MEMCG) -SUBSYS(mem_cgroup) +SUBSYS(memory) #endif #if IS_ENABLED(CONFIG_CGROUP_DEVICE) @@ -40,7 +40,7 @@ SUBSYS(blkio) #endif #if IS_ENABLED(CONFIG_CGROUP_PERF) -SUBSYS(perf) +SUBSYS(perf_event) #endif #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 787bba3bf552..0129f89cf98d 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -49,7 +49,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) static inline bool hugetlb_cgroup_disabled(void) { - if (hugetlb_subsys.disabled) + if (hugetlb_cgrp_subsys.disabled) return true; return false; } diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index abd0113b6620..eccfb4a4b379 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -162,7 +162,7 @@ extern int do_swap_account; static inline bool mem_cgroup_disabled(void) { - if (mem_cgroup_subsys.disabled) + if (memory_cgrp_subsys.disabled) return true; return false; } diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 9cf2d5ef38d9..c15d39456e14 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -34,7 +34,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - classid = container_of(task_css(p, net_cls_subsys_id), + classid = container_of(task_css(p, net_cls_cgrp_id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index b7ff5bd3c3c3..f2a9597ff53c 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -33,7 +33,7 @@ static inline u32 task_netprioidx(struct task_struct *p) u32 idx; rcu_read_lock(); - css = task_css(p, net_prio_subsys_id); + css = task_css(p, net_prio_cgrp_id); idx = css->cgroup->id; rcu_read_unlock(); return idx; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ccb16b47e293..fe3f7253aa90 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -120,10 +120,18 @@ static struct workqueue_struct *cgroup_destroy_wq; static struct workqueue_struct *cgroup_pidlist_destroy_wq; /* generate an array of cgroup subsystem pointers */ -#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, static struct cgroup_subsys *cgroup_subsys[] = { #include }; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { +#include +}; +#undef SUBSYS /* * The dummy hierarchy, reserved for the subsystems that are otherwise @@ -1076,7 +1084,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) BUG_ON(!mutex_is_locked(&cgroup_mutex)); #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_subsys_id); + mask = ~(1UL << cpuset_cgrp_id); #endif 
memset(opts, 0, sizeof(*opts)); @@ -4528,15 +4536,15 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for_each_subsys(ss, i) { - BUG_ON(!ss->name); - BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->css_alloc); - BUG_ON(!ss->css_free); - if (ss->subsys_id != i) { - printk(KERN_ERR "cgroup: Subsys %s id == %d\n", - ss->name, ss->subsys_id); - BUG(); - } + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->subsys_id, + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, + ss->subsys_id, ss->name); + WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, + "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + + ss->subsys_id = i; + ss->name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss); @@ -5167,11 +5175,9 @@ static struct cftype debug_files[] = { { } /* terminate */ }; -struct cgroup_subsys debug_subsys = { - .name = "debug", +struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6c3154e477f6..98ea26a99076 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) static inline struct freezer *task_freezer(struct task_struct *task) { - return css_freezer(task_css(task, freezer_subsys_id)); + return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) @@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state) return "THAWED"; }; -struct cgroup_subsys freezer_subsys; - static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -473,13 +471,11 @@ static struct cftype files[] = { { } /* terminate */ }; -struct cgroup_subsys freezer_subsys = { - .name = "freezer", +struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, - .subsys_id = freezer_subsys_id, .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f1..2d018c795fea 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return css_cs(task_css(task, cpuset_subsys_id)); + return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) @@ -1521,7 +1521,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, - cpuset_subsys_id); + cpuset_cgrp_id); struct cpuset *cs = css_cs(css); struct cpuset *oldcs = css_cs(oldcss); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); @@ -2024,8 +2024,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) kfree(cs); } -struct cgroup_subsys cpuset_subsys = { - .name = "cpuset", +struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = 
cpuset_css_offline, @@ -2033,7 +2032,6 @@ struct cgroup_subsys cpuset_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, }; @@ -2699,7 +2697,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; rcu_read_lock(); - css = task_css(tsk, cpuset_subsys_id); + css = task_css(tsk, cpuset_cgrp_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd3..64903731d834 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -342,7 +342,7 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_css(task, perf_subsys_id), + return container_of(task_css(task, perf_event_cgrp_id), struct perf_cgroup, css); } @@ -595,7 +595,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, rcu_read_lock(); - css = css_from_dir(f.file->f_dentry, &perf_subsys); + css = css_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -8055,9 +8055,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, +struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..d4cfc5561830 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7176,7 +7176,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7957,8 +7957,7 @@ static struct cftype cpu_files[] = { { } /* terminate */ }; -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", +struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, @@ -7966,7 +7965,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .subsys_id = cpu_cgroup_subsys_id, .base_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 622e0818f905..c143ee380e3a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return css_ca(task_css(tsk, cpuacct_subsys_id)); + return css_ca(task_css(tsk, cpuacct_cgrp_id)); } static inline struct cpuacct *parent_ca(struct cpuacct *ca) @@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", +struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, .base_cftypes = files, .early_init = 1, }; diff --git 
a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index cb00829bb466..b135853e68f3 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -30,7 +30,6 @@ struct hugetlb_cgroup { #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -struct cgroup_subsys hugetlb_subsys __read_mostly; static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline @@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { - return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) @@ -358,7 +357,7 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[4]; memset(cft, 0, sizeof(*cft)); - WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); + WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files)); return; } @@ -402,10 +401,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) return; } -struct cgroup_subsys hugetlb_subsys = { - .name = "hugetlb", +struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, - .subsys_id = hugetlb_subsys_id, }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53385cd4e6f0..04a97bce2270 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,8 +66,8 @@ #include -struct cgroup_subsys mem_cgroup_subsys __read_mostly; -EXPORT_SYMBOL(mem_cgroup_subsys); +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -538,7 +538,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; - css = css_from_id(id - 1, &mem_cgroup_subsys); + css = css_from_id(id - 1, &memory_cgrp_subsys); return mem_cgroup_from_css(css); } @@ -1072,7 +1072,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) @@ -1702,7 +1702,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) rcu_read_lock(); mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + task_cgrp = task_cgroup(p, memory_cgrp_id); ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); if (ret < 0) { @@ -6187,7 +6187,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, ret = -EINVAL; cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, - &mem_cgroup_subsys); + &memory_cgrp_subsys); if (cfile_css == css && css_tryget(css)) ret = 0; @@ -6566,11 +6566,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * unfortunate state in our controller. 
*/ if (parent != root_mem_cgroup) - mem_cgroup_subsys.broken_hierarchy = true; + memory_cgrp_subsys.broken_hierarchy = true; } mutex_unlock(&memcg_create_mutex); - return memcg_init_kmem(memcg, &mem_cgroup_subsys); + return memcg_init_kmem(memcg, &memory_cgrp_subsys); } /* @@ -7264,9 +7264,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) mem_cgroup_from_css(root_css)->use_hierarchy = true; } -struct cgroup_subsys mem_cgroup_subsys = { - .name = "memory", - .subsys_id = mem_cgroup_subsys_id, +struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, @@ -7292,7 +7290,7 @@ __setup("swapaccount=", enable_swap_account); static void __init memsw_file_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); } static void __init enable_swap_cgroup(void) diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 9e5ad5d74e60..b865662fba71 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return css_cls_state(task_css(p, net_cls_subsys_id)); + return css_cls_state(task_css(p, net_cls_cgrp_id)); } EXPORT_SYMBOL_GPL(task_cls_state); @@ -102,12 +102,10 @@ static struct cftype ss_files[] = { { } /* terminate */ }; -struct cgroup_subsys net_cls_subsys = { - .name = "net_cls", +struct cgroup_subsys net_cls_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = cgrp_attach, - .subsys_id = net_cls_subsys_id, .base_cftypes = ss_files, }; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 857e1603f9b7..d7d23e28fafd 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -244,13 +244,11 @@ static struct cftype ss_files[] = { { } /* terminate */ }; -struct cgroup_subsys net_prio_subsys = { - .name = "net_prio", +struct cgroup_subsys net_prio_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = net_prio_attach, - .subsys_id = net_prio_subsys_id, .base_cftypes = ss_files, }; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index f7e522c558ba..20a0aca9131e 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -219,7 +219,7 @@ static struct cftype tcp_files[] = { static int __init tcp_memcontrol_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files)); return 0; } __initcall(tcp_memcontrol_init); diff --git a/security/device_cgroup.c b/security/device_cgroup.c index d3b6d2cd3a06..7f88bcde7c61 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -58,11 +58,9 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { - return css_to_devcgroup(task_css(task, devices_subsys_id)); + return css_to_devcgroup(task_css(task, devices_cgrp_id)); } -struct cgroup_subsys devices_subsys; - /* * called under devcgroup_mutex */ @@ -684,13 +682,11 @@ static struct cftype dev_cgroup_files[] = { { } /* terminate */ }; -struct cgroup_subsys devices_subsys = { - .name = "devices", +struct cgroup_subsys devices_cgrp_subsys = { 
.css_alloc = devcgroup_css_alloc, .css_free = devcgroup_css_free, .css_online = devcgroup_online, .css_offline = devcgroup_offline, - .subsys_id = devices_subsys_id, .base_cftypes = dev_cgroup_files, }; -- cgit v1.3-14-g43fede From aec25020f5d4b69aea5317551d1cb7043f6b04fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: rename cgroup_subsys->subsys_id to ->id It's no longer referenced outside cgroup core, so renaming is easy. Let's rename it for consistency & brevity. This patch is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cd6611e622fd..198c7fcd727e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -548,7 +548,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); (task) = cgroup_taskset_next((tset))) \ if (!(skip_css) || \ cgroup_taskset_cur_css((tset), \ - (skip_css)->ss->subsys_id) != (skip_css)) + (skip_css)->ss->id) != (skip_css)) /* * Control Group subsystem type. @@ -592,7 +592,7 @@ struct cgroup_subsys { bool warned_broken_hierarchy; /* the following two fields are initialized automatically during boot */ - int subsys_id; + int id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fe3f7253aa90..5a77ca0784a6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -198,7 +198,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (ss) - return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + return rcu_dereference_check(cgrp->subsys[ss->id], lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; @@ -3982,7 +3982,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); + rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4014,7 +4014,7 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; css->cgroup->nr_css++; - rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; } @@ -4034,7 +4034,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); } /** @@ -4065,7 +4065,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) init_css(css, ss, cgrp); - err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) goto err_free; @@ -4292,7 +4292,7 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { - cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* * Killing would put the base ref, but we need to keep it alive @@ -4496,7 +4496,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup.
*/ - init_css_set.subsys[ss->subsys_id] = css; + init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4536,14 +4536,14 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); for_each_subsys(ss, i) { - WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->subsys_id, + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, - ss->subsys_id, ss->name); + ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); - ss->subsys_id = i; + ss->id = i; ss->name = cgroup_subsys_name[i]; if (ss->early_init) -- cgit v1.3-14-g43fede From 69e943b7d3c2dcca1087e03e556ac6cb0d4433b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:36:58 -0500 Subject: cgroup: update locking in cgroup_show_options() cgroup_show_options() grabs cgroup_root_mutex to protect the options changing while printing; however, holding root_mutex or not doesn't really make much difference for the function. subsys_mask can be atomically tested and most of the options aren't allowed to change anyway once mounted. The only field which needs synchronization is ->release_agent_path. This patch introduces a dedicated spinlock to synchronize accesses to the field and drops cgroup_root_mutex locking from cgroup_show_options(). The next patch will remove cgroup_root_mutex. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5a77ca0784a6..b15058602120 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -92,6 +92,12 @@ static DEFINE_MUTEX(cgroup_mutex); static DEFINE_MUTEX(cgroup_root_mutex); +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 
+ */ +static DEFINE_SPINLOCK(release_agent_path_lock); + #define cgroup_assert_mutex_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ lockdep_is_held(&cgroup_mutex), \ @@ -1034,7 +1040,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) struct cgroup_subsys *ss; int ssid; - mutex_lock(&cgroup_root_mutex); for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); @@ -1044,13 +1049,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + spin_unlock(&release_agent_path_lock); + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_root_mutex); return 0; } @@ -1272,8 +1280,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - if (opts.release_agent) + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } out_unlock: kfree(opts.release_agent); kfree(opts.name); @@ -2183,7 +2194,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; mutex_lock(&cgroup_root_mutex); + spin_lock(&release_agent_path_lock); strcpy(css->cgroup->root->release_agent_path, buffer); + spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; -- cgit v1.3-14-g43fede From 3417ae1f5f59bbf36c3defbbf2a76c5ca498db2a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:37:01 -0500 Subject: cgroup: remove cgroup_root_mutex cgroup_root_mutex was added to avoid deadlock involving namespace_sem via cgroup_show_options(). It added a lot of overhead for the small purpose of it and, because it's nested under cgroup_mutex, it has very limited usefulness. The previous patch made cgroup_show_options() not use cgroup_root_mutex, so nobody needs it anymore. Remove it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b15058602120..0e7829078049 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -70,18 +70,6 @@ /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. - * - * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify - * cgroupfs_root of any cgroup hierarchy - subsys list, flags, - * release_agent_path and so on. Modifying requires both cgroup_mutex and - * cgroup_root_mutex. Readers can acquire either of the two. This is to - * break the following locking order cycle. - * - * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem - * B. namespace_sem -> cgroup_mutex - * - * B happens only through cgroup_show_options() and using cgroup_root_mutex - * breaks it. 
*/ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); @@ -90,8 +78,6 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ static DEFINE_MUTEX(cgroup_mutex); #endif -static DEFINE_MUTEX(cgroup_root_mutex); - /* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. @@ -103,14 +89,6 @@ static DEFINE_SPINLOCK(release_agent_path_lock); lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); -#ifdef CONFIG_LOCKDEP -#define cgroup_assert_mutex_or_root_locked() \ - WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ - !lockdep_is_held(&cgroup_root_mutex))) -#else -#define cgroup_assert_mutex_or_root_locked() do { } while (0) -#endif - /* * cgroup destruction makes heavy use of work items and there can be a lot * of concurrent destructions. Use a separate workqueue so that cgroup @@ -154,11 +132,7 @@ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; static LIST_HEAD(cgroup_roots); static int cgroup_root_count; -/* - * Hierarchy ID allocation and mapping. It follows the same exclusion - * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for - * writes, either for reads. - */ +/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); static struct cgroup_name root_cgroup_name = { .name = "/" }; @@ -973,7 +947,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); - BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); /* Check that any added subsystems are currently free */ for_each_subsys(ss, i) @@ -1246,7 +1219,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1288,7 +1260,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) out_unlock: kfree(opts.release_agent); kfree(opts.name); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; @@ -1331,7 +1302,6 @@ static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) int id; lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, GFP_KERNEL); @@ -1345,7 +1315,6 @@ static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) static void cgroup_exit_root_id(struct cgroupfs_root *root) { lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); if (root->hierarchy_id) { idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); @@ -1524,7 +1493,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); @@ -1597,7 +1565,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); } else { @@ -1628,7 +1595,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); - 
mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); drop_new_super: @@ -1653,7 +1619,6 @@ static void cgroup_kill_sb(struct super_block *sb) mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* Rebind all subsystems back to the default hierarchy */ if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { @@ -1682,7 +1647,6 @@ static void cgroup_kill_sb(struct super_block *sb) cgroup_exit_root_id(root); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); @@ -2193,11 +2157,9 @@ static int cgroup_release_agent_write(struct cgroup_subsys_state *css, return -EINVAL; if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; - mutex_lock(&cgroup_root_mutex); spin_lock(&release_agent_path_lock); strcpy(css->cgroup->root->release_agent_path, buffer); spin_unlock(&release_agent_path_lock); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; } @@ -4588,7 +4550,6 @@ int __init cgroup_init(void) /* allocate id for the dummy hierarchy */ mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); @@ -4600,7 +4561,6 @@ int __init cgroup_init(void) 0, 1, GFP_KERNEL); BUG_ON(err < 0); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); -- cgit v1.3-14-g43fede From 6a02ad66b2c44155d529f430d4fa5c6c66321077 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 18:11:08 +0100 Subject: perf/x86: Push the duration-logging printk() to IRQ context Calling printk() from NMI context is bad (TM), so move it to IRQ context. This also avoids the problem where the printk() time is measured by the generic NMI duration goo and triggers a second warning. 
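For illustration, a minimal sketch of the deferral pattern this establishes, built on the irq_work API the patch extends (the example names are made up, not from the patch):

	#include <linux/irq_work.h>
	#include <linux/printk.h>

	/* runs later in IRQ context, where printk() is safe */
	static void example_warn(struct irq_work *w)
	{
		printk_ratelimited(KERN_WARNING "deferred diagnostic\n");
	}

	static DEFINE_IRQ_WORK(example_work, example_warn);

	/* from NMI context: only queue the work, never print here */
	static void example_nmi_path(void)
	{
		irq_work_queue(&example_work);
	}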
Signed-off-by: Peter Zijlstra Cc: Don Zickus Cc: Dave Hansen Link: http://lkml.kernel.org/n/tip-75dv35xf6dhhmeb7nq6fua31@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/irq_work.h | 2 ++ kernel/events/core.c | 28 +++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 66017028dcb3..add13c8624b7 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -30,6 +30,8 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) work->func = func; } +#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } + void irq_work_queue(struct irq_work *work); void irq_work_run(void); void irq_work_sync(struct irq_work *work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd3..2067cbb378eb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); -void perf_sample_event_took(u64 sample_len_ns) +static void perf_duration_warn(struct irq_work *w) { + u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); u64 avg_local_sample_len; u64 local_samples_len; + + local_samples_len = __get_cpu_var(running_sample_length); + avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; + + printk_ratelimited(KERN_WARNING "perf interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", avg_local_sample_len, allowed_ns, sysctl_perf_event_sample_rate); +} + +static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); + +void perf_sample_event_took(u64 sample_len_ns) +{ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); + u64 avg_local_sample_len; + u64 local_samples_len; if (allowed_ns == 0) return; @@ -263,13 +281,9 @@ void perf_sample_event_took(u64 sample_len_ns) sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; - printk_ratelimited(KERN_WARNING - "perf samples too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, - sysctl_perf_event_sample_rate); - update_perf_cpu_limits(); + + irq_work_queue(&perf_duration_work); } static atomic64_t perf_event_id; -- cgit v1.3-14-g43fede From 390f3258cb2d031f1c17aa32e771ebd336e89073 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 28 Jan 2014 11:26:14 +0400 Subject: sched/deadline: Skip in switched_to_dl() if task is current When p is current and it's not of the dl class, then there are no other dl tasks in the rq. If we had had pushable tasks in some other rq, they would have been pushed earlier. So, skip the "p == rq->curr" case.
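In other words, the guard changes from "p is queued or not running" to "p is queued and not running" (a paraphrase of the one-line hunk below, not the full function):

	/* before: also taken when p itself is the running task */
	if (p->on_rq || rq->curr != p) { ... }

	/* after: only when p is queued but some other task is running */
	if (p->on_rq && rq->curr != p) { ... }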
Signed-off-by: Kirill Tkhai Acked-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140128072421.32315.25300.stgit@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0dd5e0971a07..b5700bceee55 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1560,7 +1560,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) if (unlikely(p->dl.dl_throttled)) return; - if (p->on_rq || rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) /* Only reschedule if pushing failed */ -- cgit v1.3-14-g43fede From 6b6350f155afdfdf888e18c7bf26950a6d10b0c2 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 27 Jan 2014 17:15:38 -0500 Subject: sched: Expose some macros related to priority Some macros in kernel/sched/sched.h about priority are private to kernel/sched. But they are useful to other parts of the core kernel. This patch moves these macros from kernel/sched/sched.h to include/linux/sched/prio.h so that they are available to other subsystems. Signed-off-by: Dongsheng Yang Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Cc: clark.williams@gmail.com Cc: rostedt@goodmis.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/2b022810905b52d13238466807f4b2a691577180.1390859827.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/linux/sched/prio.h | 18 ++++++++++++++++++ kernel/sched/sched.h | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 9382ba84d5d0..13216f16762e 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -20,4 +20,22 @@ #define MAX_PRIO (MAX_RT_PRIO + 40) #define DEFAULT_PRIO (MAX_RT_PRIO + 20) +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + #endif /* _SCHED_PRIO_H */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8b..b44720d38ae9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -23,24 +23,6 @@ extern atomic_long_t calc_load_tasks; extern long calc_load_fold_active(struct rq *this_rq); extern void update_cpu_load_active(struct rq *this_rq); -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. 
- */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - /* * Helpers for converting nanosecond timing to jiffy resolution */ -- cgit v1.3-14-g43fede From 849401b66d305f3feb75b6db7459b95ad190552a Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Sun, 9 Feb 2014 11:32:22 +0530 Subject: tick: Fixup more fallout from hrtimer broadcast mode The hrtimer mode of broadcast is supported only when GENERIC_CLOCKEVENTS_BROADCAST and TICK_ONESHOT config options are enabled. Hence compile in the functions for hrtimer mode of broadcast only when these options are selected. Also fix max_delta_ticks value for the pseudo clock device. Reported-by: Fengguang Wu Reported-by: Ingo Molnar Signed-off-by: Preeti U Murthy Link: http://lkml.kernel.org/r/52F719EE.9010304@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + kernel/time/Makefile | 5 ++++- kernel/time/tick-broadcast-hrtimer.c | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 20a7183f2831..2e4cb67f6e56 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -207,6 +207,7 @@ static inline void clockevents_resume(void) {} static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } static inline int tick_check_broadcast_expired(void) { return 0; } +static inline void tick_setup_hrtimer_broadcast(void) {}; #endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 06151ef4a744..57a413fd0ebf 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -3,7 +3,10 @@ obj-y += timeconv.o posix-clock.o alarmtimer.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o tick-broadcast-hrtimer.o +ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) + obj-y += tick-broadcast.o + obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o +endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 92425279312b..eb682d5c697c 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -82,7 +82,7 @@ static struct clock_event_device ce_broadcast_hrtimer = { .min_delta_ns = 1, .max_delta_ns = KTIME_MAX, .min_delta_ticks = 1, - .max_delta_ticks = KTIME_MAX, + .max_delta_ticks = ULONG_MAX, .mult = 1, .shift = 0, .cpumask = cpu_all_mask, -- cgit v1.3-14-g43fede From 0668d3065128d39449c097e62dbdb5707820137d Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 2 Jan 2014 16:37:32 -0800 Subject: genirq: Add devm_request_any_context_irq() Some drivers use request_any_context_irq() but there isn't a devm_* function for it. Add one so that these drivers don't need to explicitly free the irq on driver detach. 
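A minimal usage sketch (a hypothetical "foo" platform driver, not from the patch; error handling follows the request_any_context_irq() contract):

	static irqreturn_t foo_handler(int irq, void *dev_id)
	{
		return IRQ_HANDLED;
	}

	static int foo_probe(struct platform_device *pdev)
	{
		int irq = platform_get_irq(pdev, 0);
		int ret;

		if (irq < 0)
			return irq;

		ret = devm_request_any_context_irq(&pdev->dev, irq, foo_handler,
						   0, "foo", pdev);
		if (ret < 0)
			return ret;

		/* on success ret is IRQC_IS_HARDIRQ or IRQC_IS_NESTED; no
		 * free_irq() is needed in remove() - the devres core releases
		 * the line on driver detach. */
		return 0;
	}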
Signed-off-by: Stephen Boyd Cc: linux-arm-kernel@lists.infradead.org Cc: Dmitry Torokhov Link: http://lkml.kernel.org/r/1388709460-19222-3-git-send-email-sboyd@codeaurora.org Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 5 +++++ kernel/irq/devres.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0053adde0ed9..a2678d35b5a2 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -158,6 +158,11 @@ devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler, devname, dev_id); } +extern int __must_check +devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id); + extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); /* diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index bd8e788d71e0..1ef0606797c9 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -72,6 +72,49 @@ int devm_request_threaded_irq(struct device *dev, unsigned int irq, } EXPORT_SYMBOL(devm_request_threaded_irq); +/** + * devm_request_any_context_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_any_context_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + */ +int devm_request_any_context_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); + if (rc < 0) { + devres_free(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return rc; +} +EXPORT_SYMBOL(devm_request_any_context_irq); + /** * devm_free_irq - free an interrupt * @dev: device to free interrupt for -- cgit v1.3-14-g43fede From d0ea026808ad81de2af14938448419a95211b938 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 27 Jan 2014 22:00:45 -0500 Subject: sched: Implement task_nice() as static inline function As patch "sched: Move the priority specific bits into a new header file" exposes the priority related macros in linux/sched/prio.h, we don't have to implement task_nice() in kernel/sched/core.c any more. This patch implements it in include/linux/sched.h as a static inline function, avoiding a function call and enhancing performance a bit.
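To make the mapping concrete, here is a standalone userspace sketch of the arithmetic that task_nice() now inlines; it assumes the kernel's MAX_RT_PRIO value of 100 and is not kernel code:

    #include <assert.h>
    #include <stdio.h>

    #define MAX_RT_PRIO         100     /* as in the kernel */
    #define NICE_TO_PRIO(nice)  (MAX_RT_PRIO + (nice) + 20)
    #define PRIO_TO_NICE(prio)  ((prio) - MAX_RT_PRIO - 20)

    int main(void)
    {
            int nice;

            /* nice -20..19 maps onto static priority 100..139 and back. */
            for (nice = -20; nice <= 19; nice++) {
                    int prio = NICE_TO_PRIO(nice);

                    assert(prio >= MAX_RT_PRIO && prio < MAX_RT_PRIO + 40);
                    assert(PRIO_TO_NICE(prio) == nice);
            }

            /* nice 0 corresponds to DEFAULT_PRIO: */
            printf("nice 0 -> static priority %d\n", NICE_TO_PRIO(0)); /* prints 120 */
            return 0;
    }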
Signed-off-by: Dongsheng Yang Cc: clark.williams@gmail.com Cc: rostedt@goodmis.org Cc: raistlin@linux.it Cc: juri.lelli@gmail.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390878045-7096-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++++- include/linux/sched/prio.h | 1 - kernel/sched/core.c | 26 +++++++------------------- kernel/sched/cputime.c | 4 ++-- 4 files changed, 19 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d97d0a8e87dc..e3d556427b2e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2094,7 +2094,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { } extern bool yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); -extern int task_nice(const struct task_struct *p); +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ]. + */ +static inline int task_nice(const struct task_struct *p) +{ + return PRIO_TO_NICE((p)->static_prio); +} extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 13216f16762e..410ccb74c9e6 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -27,7 +27,6 @@ */ #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 210a12acf2cd..104c8164e04f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3000,7 +3000,7 @@ void set_user_nice(struct task_struct *p, long nice) unsigned long flags; struct rq *rq; - if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < -20 || nice > 19) return; /* * We have to be careful, if called from sys_setpriority(), @@ -3078,7 +3078,7 @@ SYSCALL_DEFINE1(nice, int, increment) if (increment > 40) increment = 40; - nice = TASK_NICE(current) + increment; + nice = task_nice(current) + increment; if (nice < -20) nice = -20; if (nice > 19) @@ -3110,18 +3110,6 @@ int task_prio(const struct task_struct *p) return p->prio - MAX_RT_PRIO; } -/** - * task_nice - return the nice value of a given task. - * @p: the task in question. - * - * Return: The nice value [ -20 ... 0 ... 19 ]. - */ -int task_nice(const struct task_struct *p) -{ - return TASK_NICE(p); -} -EXPORT_SYMBOL(task_nice); - /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. @@ -3321,7 +3309,7 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (fair_policy(policy)) { - if (attr->sched_nice < TASK_NICE(p) && + if (attr->sched_nice < task_nice(p) && !can_nice(p, attr->sched_nice)) return -EPERM; } @@ -3345,7 +3333,7 @@ recheck: * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 
*/ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { - if (!can_nice(p, TASK_NICE(p))) + if (!can_nice(p, task_nice(p))) return -EPERM; } @@ -3385,7 +3373,7 @@ recheck: * If not changing anything there's no need to proceed further: */ if (unlikely(policy == p->policy)) { - if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) + if (fair_policy(policy) && attr->sched_nice != task_nice(p)) goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; @@ -3837,7 +3825,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, else if (task_has_rt_policy(p)) attr.sched_priority = p->rt_priority; else - attr.sched_nice = TASK_NICE(p); + attr.sched_nice = task_nice(p); rcu_read_unlock(); @@ -7010,7 +6998,7 @@ void normalize_rt_tasks(void) * Renice negative nice level userspace * tasks back to 0: */ - if (TASK_NICE(p) < 0 && p->mm) + if (task_nice(p) < 0 && p->mm) set_user_nice(p, 0); continue; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30b..58624a65f124 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime, p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); - index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; /* Add user time to cpustat. */ task_group_account_field(p, index, (__force u64) cputime); @@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, p->gtime += cputime; /* Add guest time to cpustat. */ - if (TASK_NICE(p) > 0) { + if (task_nice(p) > 0) { cpustat[CPUTIME_NICE] += (__force u64) cputime; cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; } else { -- cgit v1.3-14-g43fede From fb9edbe98493fcd9df66de926ae9157cbe0e4dcd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:06 +0100 Subject: lockdep: Make held_lock->check and "int check" argument bool The "int check" argument of lock_acquire() and held_lock->check are misleading. This is actually a boolean: 2 means "true", everything else is "false". And there is no need to pass 1 or 0 to lock_acquire() depending on CONFIG_PROVE_LOCKING, __lock_acquire() checks prove_locking at the start and clears "check" if !CONFIG_PROVE_LOCKING. Note: probably we can simply kill this member/arg. The only explicit user of check => 0 is rcu_lock_acquire(), perhaps we can change it to use lock_acquire(trylock =>, read => 2). __lockdep_no_validate means check => 0 implicitly, but we can change validate_chain() to check hlock->instance->key instead. Not to mention it would be nice to get rid of lockdep_set_novalidate_class(). 
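As an illustration of the new convention, a hypothetical locking primitive annotated for lockdep would now pass the boolean directly (a sketch only; foo_lock is an assumed out-of-tree type, not from this patch):

    #include <linux/lockdep.h>

    struct foo_lock {
            struct lockdep_map dep_map;
            /* ... the actual lock state ... */
    };

    static void foo_lock_acquire(struct foo_lock *l)
    {
            /*
             * subclass=0, trylock=0, read=0 (exclusive), and check=1,
             * which after this change simply means "full validation";
             * check=0 would request the simple checks only.
             */
            lock_acquire(&l->dep_map, 0, 0, 0, 1, NULL, _RET_IP_);
            /* ... take the lock itself ... */
    }

    static void foo_lock_release(struct foo_lock *l)
    {
            lock_release(&l->dep_map, 0, _RET_IP_);
            /* ... drop the lock itself ... */
    }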
Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182006.GA26495@redhat.com Signed-off-by: Ingo Molnar --- drivers/tty/tty_ldsem.c | 15 ++++----------- include/linux/lockdep.h | 25 +++++++++---------------- include/linux/rcupdate.h | 2 +- kernel/locking/lockdep.c | 11 ++++------- 4 files changed, 18 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/tty_ldsem.c b/drivers/tty/tty_ldsem.c index d8a55e87877f..0ffb0cbe2823 100644 --- a/drivers/tty/tty_ldsem.c +++ b/drivers/tty/tty_ldsem.c @@ -39,17 +39,10 @@ lock_acquire(&(l)->dep_map, s, t, r, c, n, i) # define __rel(l, n, i) \ lock_release(&(l)->dep_map, n, i) -# ifdef CONFIG_PROVE_LOCKING -# define lockdep_acquire(l, s, t, i) __acq(l, s, t, 0, 2, NULL, i) -# define lockdep_acquire_nest(l, s, t, n, i) __acq(l, s, t, 0, 2, n, i) -# define lockdep_acquire_read(l, s, t, i) __acq(l, s, t, 1, 2, NULL, i) -# define lockdep_release(l, n, i) __rel(l, n, i) -# else -# define lockdep_acquire(l, s, t, i) __acq(l, s, t, 0, 1, NULL, i) -# define lockdep_acquire_nest(l, s, t, n, i) __acq(l, s, t, 0, 1, n, i) -# define lockdep_acquire_read(l, s, t, i) __acq(l, s, t, 1, 1, NULL, i) -# define lockdep_release(l, n, i) __rel(l, n, i) -# endif +#define lockdep_acquire(l, s, t, i) __acq(l, s, t, 0, 1, NULL, i) +#define lockdep_acquire_nest(l, s, t, n, i) __acq(l, s, t, 0, 1, n, i) +#define lockdep_acquire_read(l, s, t, i) __acq(l, s, t, 1, 1, NULL, i) +#define lockdep_release(l, n, i) __rel(l, n, i) #else # define lockdep_acquire(l, s, t, i) do { } while (0) # define lockdep_acquire_nest(l, s, t, n, i) do { } while (0) diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 92b1bfc5da60..1626047c1f26 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -252,9 +252,9 @@ struct held_lock { unsigned int trylock:1; /* 16 bits */ unsigned int read:2; /* see lock_acquire() comment */ - unsigned int check:2; /* see lock_acquire() comment */ + unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; - unsigned int references:11; /* 32 bits */ + unsigned int references:12; /* 32 bits */ }; /* @@ -326,9 +326,8 @@ static inline int lockdep_match_key(struct lockdep_map *lock, * * Values for check: * - * 0: disabled - * 1: simple checks (freeing, held-at-exit-time, etc.) - * 2: full validation + * 0: simple checks (freeing, held-at-exit-time, etc.) 
+ * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, @@ -479,15 +478,9 @@ static inline void print_irqtrace_events(struct task_struct *curr) * on the per lock-class debug mode: */ -#ifdef CONFIG_PROVE_LOCKING - #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 2, n, i) - #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 2, n, i) - #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 2, n, i) -#else - #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) - #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) - #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) -#endif +#define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) +#define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) +#define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) @@ -518,13 +511,13 @@ static inline void print_irqtrace_events(struct task_struct *curr) # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ - lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \ + lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ - lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \ + lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ } while (0) #else diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 72bf3a01a4ee..adff3c99dcaa 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -314,7 +314,7 @@ static inline bool rcu_lockdep_current_cpu_online(void) static inline void rcu_lock_acquire(struct lockdep_map *map) { - lock_acquire(map, 0, 0, 2, 1, NULL, _THIS_IP_); + lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa0..8c85a0da5a38 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, * (If lookup_chain_cache() returns with 1 it acquires * graph_lock for us) */ - if (!hlock->trylock && (hlock->check == 2) && + if (!hlock->trylock && hlock->check && lookup_chain_cache(curr, hlock, chain_key)) { /* * Check whether last held lock: @@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; - if (!prove_locking) - check = 1; - if (unlikely(!debug_locks)) return 0; @@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (lock->key == &__lockdep_no_validate__) - check = 1; + if (!prove_locking || lock->key == &__lockdep_no_validate__) + check = 0; if (subclass < NR_LOCKDEP_CACHING_CLASSES) class = lock->class_cache[subclass]; @@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, hlock->holdtime_stamp = lockstat_clock(); 
#endif - if (check == 2 && !mark_irqflags(curr, hlock)) + if (check && !mark_irqflags(curr, hlock)) return 0; /* mark it as used: */ -- cgit v1.3-14-g43fede From 1b5ff816cab708ba44c7d7b56b613516269eb577 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:10 +0100 Subject: lockdep: Don't create the wrong dependency on hlock->check == 0 Test-case:

    DEFINE_MUTEX(m1);
    DEFINE_MUTEX(m2);
    DEFINE_MUTEX(mx);

    void lockdep_should_complain(void)
    {
            lockdep_set_novalidate_class(&mx);

            // m1 -> mx -> m2
            mutex_lock(&m1);
            mutex_lock(&mx);
            mutex_lock(&m2);
            mutex_unlock(&m2);
            mutex_unlock(&mx);
            mutex_unlock(&m1);

            // m2 -> m1 ; should trigger the warning
            mutex_lock(&m2);
            mutex_lock(&m1);
            mutex_unlock(&m1);
            mutex_unlock(&m2);
    }

This doesn't trigger any warning; lockdep can't detect the trivial deadlock. This is because lock(&mx) correctly avoids the m1 -> mx dependency; it skips validate_chain() due to mx->check == 0. But lock(&m2) wrongly adds mx -> m2 and thus m1 -> m2 is not created. rcu_lock_acquire()->lock_acquire(check => 0) is fine due to read == 2, so currently only __lockdep_no_validate__ can trigger this problem. Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182010.GA26498@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8c85a0da5a38..f7eba92cb97c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; - hlock = curr->held_locks + depth-1; + hlock = curr->held_locks + depth - 1; /* * Only non-recursive-read entries get new dependencies * added: */ - if (hlock->read != 2) { + if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, distance, trylock_loop)) return 0; -- cgit v1.3-14-g43fede From 34d0ed5ea7a72d5961552fb1758a94f0d3f8f3dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jan 2014 19:20:13 +0100 Subject: lockdep: Change mark_held_locks() to check hlock->check instead of lockdep_no_validate The __lockdep_no_validate check in mark_held_locks() adds the subtle and (afaics) unnecessary difference between no-validate and check==0. And this looks even more inconsistent because __lock_acquire() skips mark_irqflags()->mark_lock() if !check. Change mark_held_locks() to check hlock->check instead.
Signed-off-by: Oleg Nesterov Cc: Dave Jones Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Paul McKenney Cc: Steven Rostedt Cc: Alan Stern Cc: Sasha Levin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140120182013.GA26505@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f7eba92cb97c..bf0c6b0dd9c5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) BUG_ON(usage_bit >= LOCK_USAGE_STATES); - if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys) + if (!hlock->check) continue; if (!mark_lock(curr, hlock, usage_bit)) -- cgit v1.3-14-g43fede From b4f2ab43615e5b36c48fffa99f26aca381839ac6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 17 Jan 2014 10:04:01 +0100 Subject: sched: Remove 'cpu' parameter from idle_balance() The cpu parameter passed to idle_balance() is not needed as it could be retrieved from 'struct rq.' Signed-off-by: Daniel Lezcano Cc: alex.shi@linaro.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389949444-14821-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 3 ++- kernel/sched/sched.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 104c8164e04f..74dd565c2e1b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2705,7 +2705,7 @@ need_resched: pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) - idle_balance(cpu, rq); + idle_balance(rq); put_prev_task(rq, prev); next = pick_next_task(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4caa8030824d..428bc9d2c383 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6531,12 +6531,13 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(int this_cpu, struct rq *this_rq) +void idle_balance(struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; unsigned long next_balance = jiffies + HZ; u64 curr_cost = 0; + int this_cpu = this_rq->cpu; this_rq->idle_stamp = rq_clock(this_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b44720d38ae9..82c0e02f2a58 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1158,7 +1158,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_balance(int this_cpu, struct rq *this_rq); +extern void idle_balance(struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -- cgit v1.3-14-g43fede From e5fc66119ec97054eefc83f173a7ee9e133c3c3a Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 17 Jan 2014 10:04:02 +0100 Subject: sched: Fix race in idle_balance() The scheduler main function 'schedule()' checks if there are no more tasks on the runqueue. Then it checks if a task should be pulled in the current runqueue in idle_balance() assuming it will go to idle otherwise. But idle_balance() releases the rq->lock in order to look up the sched domains and takes the lock again right after. 
That opens a window where another cpu may put a task in our runqueue, so we won't go idle but we have filled the idle_stamp, thinking we will. This patch closes the window by checking, after taking the lock again, whether the runqueue has been modified without a task having been pulled, so we won't go idle right after in the __schedule() function. Signed-off-by: Daniel Lezcano Cc: alex.shi@linaro.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389949444-14821-2-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 428bc9d2c383..5ebc6817c036 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6589,6 +6589,13 @@ void idle_balance(struct rq *this_rq) raw_spin_lock(&this_rq->lock); + /* + * While browsing the domains, we released the rq lock. + * A task could have been enqueued in the meantime + */ + if (this_rq->nr_running && !pulled_task) + return; + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* * We are going idle. next_balance may be set based on -- cgit v1.3-14-g43fede From 3c4017c13f91069194fce3160944efec50f15a6e Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Fri, 17 Jan 2014 10:04:03 +0100 Subject: sched: Move rq->idle_stamp up to the core idle_balance() modifies the rq->idle_stamp field, making this information shared across core.c and fair.c. As we know if the cpu is going idle or not with the previous patch, let's encapsulate the rq->idle_stamp information in core.c by moving it up to the caller. The idle_balance() function returns true in case balancing occurred and the cpu won't be idle, false if no balance happened and the cpu is going idle. Signed-off-by: Daniel Lezcano Cc: alex.shi@linaro.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1389949444-14821-3-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 +++++++++-- kernel/sched/fair.c | 14 ++++++-------- kernel/sched/sched.h | 2 +- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 74dd565c2e1b..417cf657a606 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2704,8 +2704,15 @@ need_resched: pre_schedule(rq, prev); - if (unlikely(!rq->nr_running)) - idle_balance(rq); + if (unlikely(!rq->nr_running)) { + /* + * We must set idle_stamp _before_ calling idle_balance(), such + * that we measure the duration of idle_balance() as idle time. + */ + rq->idle_stamp = rq_clock(rq); + if (idle_balance(rq)) + rq->idle_stamp = 0; + } put_prev_task(rq, prev); next = pick_next_task(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5ebc6817c036..04fea7744a9f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6531,7 +6531,7 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -void idle_balance(struct rq *this_rq) +int idle_balance(struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; @@ -6539,10 +6539,8 @@ void idle_balance(struct rq *this_rq) u64 curr_cost = 0; int this_cpu = this_rq->cpu; - this_rq->idle_stamp = rq_clock(this_rq); - if (this_rq->avg_idle < sysctl_sched_migration_cost) - return; + return 0; /* * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6580,10 +6578,8 @@ void idle_balance(struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } rcu_read_unlock(); @@ -6594,7 +6590,7 @@ void idle_balance(struct rq *this_rq) * A task could have be enqueued in the meantime */ if (this_rq->nr_running && !pulled_task) - return; + return 1; if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -6606,6 +6602,8 @@ void idle_balance(struct rq *this_rq) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; + + return pulled_task; } /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 82c0e02f2a58..bb89991ee409 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1158,7 +1158,7 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_balance(struct rq *this_rq); +extern int idle_balance(struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -- cgit v1.3-14-g43fede From fed14d45f945042a15b09de48d7d3d58d9455fc4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched/fair: Track cgroup depth Track depth in cgroup tree, this is useful for things like find_matching_se() where you need to get to a common parent of two sched entities. Keeping the depth avoids having to calculate it on the spot, which saves a number of possible cache-misses. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/fair.c | 47 +++++++++++++++++++++-------------------------- 2 files changed, 22 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index e3d556427b2e..555e27d717c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1078,6 +1078,7 @@ struct sched_entity { #endif #ifdef CONFIG_FAIR_GROUP_SCHED + int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04fea7744a9f..748a7ac3388f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? 
*/ -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { if (se->cfs_rq == pse->cfs_rq) - return 1; + return se->cfs_rq; - return 0; + return NULL; } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -/* return depth at which a sched entity is present in the hierarchy */ -static inline int depth_se(struct sched_entity *se) -{ - int depth = 0; - - for_each_sched_entity(se) - depth++; - - return depth; -} - static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { @@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) */ /* First walk up until both entities are at same depth */ - se_depth = depth_se(*se); - pse_depth = depth_se(*pse); + se_depth = (*se)->depth; + pse_depth = (*pse)->depth; while (se_depth > pse_depth) { se_depth--; @@ -426,10 +415,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline int +static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { - return 1; + return cfs_rq_of(se); /* always the same rq */ } static inline struct sched_entity *parent_entity(struct sched_entity *se) @@ -7262,7 +7251,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { + struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq; + /* * If the task was not on the rq at the time of this cgroup movement * it must have been asleep, sleeping tasks keep their ->vruntime @@ -7288,23 +7279,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * To prevent boost or penalty in the new cfs_rq caused by delta * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. */ - if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) on_rq = 1; if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + se->vruntime -= cfs_rq_of(se)->min_vruntime; set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; if (!on_rq) { - cfs_rq = cfs_rq_of(&p->se); - p->se.vruntime += cfs_rq->min_vruntime; + cfs_rq = cfs_rq_of(se); + se->vruntime += cfs_rq->min_vruntime; #ifdef CONFIG_SMP /* * migrate_task_rq_fair() will have removed our previous * contribution, but we must synchronize for ongoing future * decay. 
*/ - p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); - cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; + se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); + cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; #endif } } @@ -7400,10 +7392,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, if (!se) return; - if (!parent) + if (!parent) { se->cfs_rq = &rq->cfs; - else + se->depth = 0; + } else { se->cfs_rq = parent->my_q; + se->depth = parent->depth + 1; + } se->my_q = cfs_rq; /* guarantee group entities always have weight */ -- cgit v1.3-14-g43fede From 606dba2e289446600a0b68422ed2019af5355c12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched: Push put_prev_task() into pick_next_task() In order to avoid having to do put/set on a whole cgroup hierarchy when we context switch, push the put into pick_next_task() so that both operations are in the same function. Further changes then allow us to possibly optimize away redundant work. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 ++++++++------------- kernel/sched/deadline.c | 5 ++++- kernel/sched/fair.c | 6 +++++- kernel/sched/idle_task.c | 6 +++++- kernel/sched/rt.c | 27 ++++++++++++++++----------- kernel/sched/sched.h | 8 +++++++- kernel/sched/stop_task.c | 16 ++++++++++------ 7 files changed, 55 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 417cf657a606..dedb5f07666e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2579,18 +2579,11 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); } -static void put_prev_task(struct rq *rq, struct task_struct *prev) -{ - if (prev->on_rq || rq->skip_clock_update < 0) - update_rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev); -} - /* * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq) +pick_next_task(struct rq *rq, struct task_struct *prev) { const struct sched_class *class; struct task_struct *p; @@ -2600,13 +2593,13 @@ pick_next_task(struct rq *rq) * the fair class we can call that function directly: */ if (likely(rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq); + p = fair_sched_class.pick_next_task(rq, prev); if (likely(p)) return p; } for_each_class(class) { - p = class->pick_next_task(rq); + p = class->pick_next_task(rq, prev); if (p) return p; } @@ -2714,8 +2707,10 @@ need_resched: rq->idle_stamp = 0; } - put_prev_task(rq, prev); - next = pick_next_task(rq); + if (prev->on_rq || rq->skip_clock_update < 0) + update_rq_clock(rq); + + next = pick_next_task(rq, prev); clear_tsk_need_resched(prev); clear_preempt_need_resched(); rq->skip_clock_update = 0; @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = pick_next_task(rq); + next = pick_next_task(rq, NULL); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b5700bceee55..50797d576080 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -990,7 +990,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -struct task_struct *pick_next_task_dl(struct rq *rq) +struct task_struct *pick_next_task_dl(struct 
rq *rq, struct task_struct *prev) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1001,6 +1001,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq) if (unlikely(!dl_rq->dl_nr_running)) return NULL; + if (prev) + prev->sched_class->put_prev_task(rq, prev); + dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 748a7ac3388f..c4bb0ac26a7c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4655,7 +4655,8 @@ preempt: set_last_buddy(se); } -static struct task_struct *pick_next_task_fair(struct rq *rq) +static struct task_struct * +pick_next_task_fair(struct rq *rq, struct task_struct *prev) { struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; @@ -4664,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) if (!cfs_rq->nr_running) return NULL; + if (prev) + prev->sched_class->put_prev_task(rq, prev); + do { se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 516c3d9ceea1..e5c922ac40ce 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -33,8 +33,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_task(rq->idle); } -static struct task_struct *pick_next_task_idle(struct rq *rq) +static struct task_struct * +pick_next_task_idle(struct rq *rq, struct task_struct *prev) { + if (prev) + prev->sched_class->put_prev_task(rq, prev); + schedstat_inc(rq, sched_goidle); #ifdef CONFIG_SMP /* Trigger the post schedule to do an idle_enter for CFS */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b45..a15ca1c0c7bf 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; - struct rt_rq *rt_rq; - - rt_rq = &rq->rt; - - if (!rt_rq->rt_nr_running) - return NULL; - - if (rt_rq_throttled(rt_rq)) - return NULL; + struct rt_rq *rt_rq = &rq->rt; do { rt_se = pick_next_rt_entity(rq, rt_rq); @@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static struct task_struct *pick_next_task_rt(struct rq *rq) +static struct task_struct * +pick_next_task_rt(struct rq *rq, struct task_struct *prev) { - struct task_struct *p = _pick_next_task_rt(rq); + struct task_struct *p; + struct rt_rq *rt_rq = &rq->rt; + + if (!rt_rq->rt_nr_running) + return NULL; + + if (rt_rq_throttled(rt_rq)) + return NULL; + + if (prev) + prev->sched_class->put_prev_task(rq, prev); + + p = _pick_next_task_rt(rq); /* The running task is never eligible for pushing */ if (p) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index bb89991ee409..c534cf4181ab 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1105,7 +1105,13 @@ struct sched_class { void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); - struct task_struct * (*pick_next_task) (struct rq *rq); + /* + * It is the responsibility of the pick_next_task() method that will + * return the next task to call put_prev_task() on the @prev task or + * something equivalent. 
+ */ + struct task_struct * (*pick_next_task) (struct rq *rq, + struct task_struct *prev); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index fdb6bb0b3356..a4147c9d2017 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) /* we're never preempted */ } -static struct task_struct *pick_next_task_stop(struct rq *rq) +static struct task_struct * +pick_next_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *stop = rq->stop; - if (stop && stop->on_rq) { - stop->se.exec_start = rq_clock_task(rq); - return stop; - } + if (!stop || !stop->on_rq) + return NULL; - return NULL; + if (prev) + prev->sched_class->put_prev_task(rq, prev); + + stop->se.exec_start = rq_clock_task(rq); + + return stop; } static void -- cgit v1.3-14-g43fede From f10447998a59b97747c16258a9c6e6a1512f27f3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched/fair: Clean up the __clear_buddies_*() functions Slightly easier code flow, no functional changes. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c4bb0ac26a7c..846172107ba5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2739,10 +2739,10 @@ static void __clear_buddies_last(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->last == se) - cfs_rq->last = NULL; - else + if (cfs_rq->last != se) break; + + cfs_rq->last = NULL; } } @@ -2750,10 +2750,10 @@ static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->next == se) - cfs_rq->next = NULL; - else + if (cfs_rq->next != se) break; + + cfs_rq->next = NULL; } } @@ -2761,10 +2761,10 @@ static void __clear_buddies_skip(struct sched_entity *se) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - if (cfs_rq->skip == se) - cfs_rq->skip = NULL; - else + if (cfs_rq->skip != se) break; + + cfs_rq->skip = NULL; } } -- cgit v1.3-14-g43fede From 678d5718d8d099421b0dd54c01b0528f4aaf5919 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 11 Feb 2012 06:05:00 +0100 Subject: sched/fair: Optimize cgroup pick_next_task_fair() Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt path") it is likely we pick a new task from the same cgroup, doing a put and then set on all intermediate entities is a waste of time, so try to avoid this. 
Measured using:

    mount nodev /cgroup -t cgroup -o cpu
    cd /cgroup
    mkdir a; cd a
    mkdir b; cd b
    mkdir c; cd c
    echo $$ > tasks

    perf stat --repeat 10 -- taskset 1 perf bench sched pipe

    PRE : 4.542422684 seconds time elapsed ( +- 0.33% )
    POST: 4.389409991 seconds time elapsed ( +- 0.32% )

Which shows a significant improvement of ~3.5% Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Tejun Heo Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 110 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 846172107ba5..a81b241ff70f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2907,17 +2907,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ -static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity * +pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - struct sched_entity *se = __pick_first_entity(cfs_rq); - struct sched_entity *left = se; + struct sched_entity *left = __pick_first_entity(cfs_rq); + struct sched_entity *se; + + /* + * If curr is set we have to see if it's left of the leftmost entity + * still in the tree, provided there was anything in the tree at all. + */ + if (!left || (curr && entity_before(curr, left))) + left = curr; + + se = left; /* ideally we run the leftmost entity */ /* * Avoid running the skip buddy, if running something else can * be done without getting too unfair. */ if (cfs_rq->skip == se) { - struct sched_entity *second = __pick_next_entity(se); + struct sched_entity *second; + + if (se == curr) { + second = __pick_first_entity(cfs_rq); + } else { + second = __pick_next_entity(se); + if (!second || (curr && entity_before(curr, second))) + second = curr; + } + if (second && wakeup_preempt_entity(second, left) < 1) se = second; } @@ -2939,7 +2958,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) return se; } -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { @@ -3594,22 +3613,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) } /* conditionally throttle active cfs_rq's from put_prev_entity() */ -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { if (!cfs_bandwidth_used()) - return; + return false; if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) - return; + return false; /* * it's possible for a throttled entity to be forced into a running * state (e.g. set_curr_task), in this case we're finished.
*/ if (cfs_rq_throttled(cfs_rq)) - return; + return true; throttle_cfs_rq(cfs_rq); + return true; } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) @@ -3719,7 +3739,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} -static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -4658,9 +4678,86 @@ preempt: static struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev) { - struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; + struct task_struct *p; + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (!cfs_rq->nr_running) + return NULL; + + if (!prev || prev->sched_class != &fair_sched_class) + goto simple; + + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. + */ + + do { + struct sched_entity *curr = cfs_rq->curr; + + /* + * Since we got here without doing put_prev_entity() we also + * have to consider cfs_rq->curr. If it is still a runnable + * entity, update_curr() will update its vruntime, otherwise + * forget we've ever seen it. + */ + if (curr && curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; + + /* + * This call to check_cfs_rq_runtime() will do the throttle and + * dequeue its entity in the parent(s). Therefore the 'simple' + * nr_running test will indeed be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + + se = pick_next_entity(cfs_rq, curr); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + + /* + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. + */ + if (prev != p) { + struct sched_entity *pse = &prev->se; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; + int pse_depth = pse->depth; + + if (se_depth <= pse_depth) { + put_prev_entity(cfs_rq_of(pse), pse); + pse = parent_entity(pse); + } + if (se_depth >= pse_depth) { + set_next_entity(cfs_rq_of(se), se); + se = parent_entity(se); + } + } + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); + } + + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); + + return p; +simple: + cfs_rq = &rq->cfs; +#endif if (!cfs_rq->nr_running) return NULL; @@ -4669,12 +4766,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) prev->sched_class->put_prev_task(rq, prev); do { - se = pick_next_entity(cfs_rq); + se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); + if (hrtick_enabled(rq)) hrtick_start_fair(rq, p); -- cgit v1.3-14-g43fede From 6c3b4d44ba2838f00614a5a2d777d4401e0bfd71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Jan 2014 15:09:39 +0100 Subject: sched: Clean up idle task SMP logic The idle post_schedule flag is just a vile waste of time, furthermore it appears unneeded, move the idle_enter_fair() call into pick_next_task_idle(). 
Signed-off-by: Peter Zijlstra Cc: Daniel Lezcano Cc: Vincent Guittot Cc: alex.shi@linaro.org Cc: mingo@kernel.org Cc: Steven Rostedt Link: http://lkml.kernel.org/n/tip-aljykihtxJt3mkokxi0qZurb@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/idle_task.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index e5c922ac40ce..721371bf03bd 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -19,11 +19,6 @@ static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) idle_exit_fair(rq); rq_last_tick_reset(rq); } - -static void post_schedule_idle(struct rq *rq) -{ - idle_enter_fair(rq); -} #endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -41,8 +36,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) schedstat_inc(rq, sched_goidle); #ifdef CONFIG_SMP - /* Trigger the post schedule to do an idle_enter for CFS */ - rq->post_schedule = 1; + idle_enter_fair(rq); #endif return rq->idle; } @@ -106,7 +100,6 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, .pre_schedule = pre_schedule_idle, - .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, -- cgit v1.3-14-g43fede From 327adaedf2218b0e318eb393aa79cf2be64c199f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2014 00:35:29 +0100 Subject: PM / QoS: Add no_constraint_value field to struct pm_qos_constraints Add a new field, no_constraint_value, to struct pm_qos_constraints (which represents a list of PM QoS constraint requests), to be returned by pm_qos_get_value() when that list of requests is empty. That field will be equal to default_value for all of the existing global PM QoS classes and for the resume latency device PM QoS type, but it will be different from default_value for the new latency tolerance device PM QoS type introduced by the next changeset. Signed-off-by: Rafael J.
Wysocki --- drivers/base/power/qos.c | 1 + include/linux/pm_qos.h | 1 + kernel/power/qos.c | 5 ++++- 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index 67c0f4219b02..c754e55f9dcb 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -190,6 +190,7 @@ static int dev_pm_qos_constraints_allocate(struct device *dev) plist_head_init(&c->list); c->target_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->default_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; + c->no_constraint_value = PM_QOS_RESUME_LATENCY_DEFAULT_VALUE; c->type = PM_QOS_MIN; c->notifiers = n; diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 88a3680ae74c..2d8ce50877d8 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -77,6 +77,7 @@ struct pm_qos_constraints { struct plist_head list; s32 target_value; /* Do not change to 64 bit */ s32 default_value; + s32 no_constraint_value; enum pm_qos_type type; struct blocking_notifier_head *notifiers; }; diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 8dff9b48075a..e23ae38e647f 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -66,6 +66,7 @@ static struct pm_qos_constraints cpu_dma_constraints = { .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, .notifiers = &cpu_dma_lat_notifier, }; @@ -79,6 +80,7 @@ static struct pm_qos_constraints network_lat_constraints = { .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, .notifiers = &network_lat_notifier, }; @@ -93,6 +95,7 @@ static struct pm_qos_constraints network_tput_constraints = { .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, .notifiers = &network_throughput_notifier, }; @@ -128,7 +131,7 @@ static const struct file_operations pm_qos_power_fops = { static inline int pm_qos_get_value(struct pm_qos_constraints *c) { if (plist_head_empty(&c->list)) - return c->default_value; + return c->no_constraint_value; switch (c->type) { case PM_QOS_MIN: -- cgit v1.3-14-g43fede From 2d984ad132a87ca2112f81f21039493176a8bca0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Feb 2014 00:35:38 +0100 Subject: PM / QoS: Introduce latency tolerance device PM QoS type Add a new latency tolerance device PM QoS type to be used for specifying active state (RPM_ACTIVE) memory access (DMA) latency tolerance requirements for devices. It may be used to prevent hardware from choosing overly aggressive energy-saving operation modes (causing too much latency to appear) for the whole platform. This feature requires hardware support, so it will only be available for devices having a new .set_latency_tolerance() callback in struct dev_pm_info populated, in which case the routine pointed to by it should implement whatever is necessary to transfer the effective requirement value to the hardware.
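For illustration only, a driver for hardware with such a control mechanism might wire the callback up as follows (the foo_* names and the register-programming stubs are assumptions, not part of this patch):

    #include <linux/device.h>
    #include <linux/pm_qos.h>

    /* Stand-ins for programming a (hypothetical) latency tolerance register. */
    static void foo_ltr_autonomous(struct device *dev)
    {
            dev_dbg(dev, "LTR: autonomous mode\n");
    }

    static void foo_ltr_none(struct device *dev)
    {
            dev_dbg(dev, "LTR: no requirement, autonomous updates off\n");
    }

    static void foo_ltr_us(struct device *dev, s32 val)
    {
            dev_dbg(dev, "LTR: %d us\n", val);
    }

    static void foo_set_latency_tolerance(struct device *dev, s32 val)
    {
            if (val < 0) {
                    /* The request list is empty: hand control to the hardware. */
                    foo_ltr_autonomous(dev);
            } else if (val == PM_QOS_LATENCY_ANY) {
                    /* Latency does not matter, but keep auto-updates disabled. */
                    foo_ltr_none(dev);
            } else {
                    /* Program the effective tolerance, in microseconds. */
                    foo_ltr_us(dev, val);
            }
    }

    static int foo_probe(struct device *dev)
    {
            /* Populating the callback makes pm_qos_latency_tolerance_us appear. */
            dev->power.set_latency_tolerance = foo_set_latency_tolerance;
            return 0;
    }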
Whenever the effective latency tolerance changes for the device, its .set_latency_tolerance() callback will be executed and the effective value will be passed to it. If that value is negative, which means that the list of latency tolerance requirements for the device is empty, the callback is expected to switch the underlying hardware latency tolerance control mechanism to an autonomous mode if available. If that value is PM_QOS_LATENCY_ANY, in turn, and the hardware supports a special "no requirement" setting, the callback is expected to use it. That allows software to prevent the hardware from automatically updating the device's latency tolerance in response to its power state changes (e.g. during transitions from D3cold to D0), which generally may be done in the autonomous latency tolerance control mode. If .set_latency_tolerance() is present for the device, a new pm_qos_latency_tolerance_us attribute will be present in the device's power directory in sysfs. Then, user space can use that attribute to specify its latency tolerance requirement for the device, if any. Writing "any" to it means "no requirement, but do not let the hardware control latency tolerance" and writing "auto" to it allows the hardware to be switched to the autonomous mode if there are no other requirements from the kernel side in the device's list. This changeset includes a fix from Mika Westerberg. Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-devices-power | 27 ++++- Documentation/power/pm_qos_interface.txt | 59 +++++++++-- drivers/base/power/qos.c | 144 ++++++++++++++++++++++---- drivers/base/power/sysfs.c | 65 ++++++++++-- include/linux/pm.h | 1 + include/linux/pm_qos.h | 12 +++ kernel/power/qos.c | 13 ++- 7 files changed, 277 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-devices-power b/Documentation/ABI/testing/sysfs-devices-power index efe449bdf811..7dbf96b724ed 100644 --- a/Documentation/ABI/testing/sysfs-devices-power +++ b/Documentation/ABI/testing/sysfs-devices-power @@ -187,7 +187,7 @@ Description: Not all drivers support this attribute. If it isn't supported, attempts to read or write it will yield I/O errors. -What: /sys/devices/.../power/pm_qos_latency_us +What: /sys/devices/.../power/pm_qos_resume_latency_us Date: March 2012 Contact: Rafael J. Wysocki Description: @@ -205,6 +205,31 @@ Description: This attribute has no effect on system-wide suspend/resume and hibernation. +What: /sys/devices/.../power/pm_qos_latency_tolerance_us +Date: January 2014 +Contact: Rafael J. Wysocki +Description: + The /sys/devices/.../power/pm_qos_latency_tolerance_us attribute + contains the PM QoS active state latency tolerance limit for the + given device in microseconds. That is the maximum memory access + latency the device can suffer without any visible adverse + effects on user space functionality. If that value is the + string "any", the latency does not matter to user space at all, + but hardware should not be allowed to set the latency tolerance + for the device automatically. + + Reading "auto" from this file means that the maximum memory + access latency for the device may be determined automatically + by the hardware as needed. Writing "auto" to it allows the + hardware to be switched to this mode if there are no other + latency tolerance requirements from the kernel side. + + This attribute is only present if the feature controlled by it + is supported by the hardware.
+ + This attribute has no effect on runtime suspend and resume of + devices and on system-wide suspend/resume and hibernation. + What: /sys/devices/.../power/pm_qos_no_power_off Date: September 2012 Contact: Rafael J. Wysocki diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index 22cb8f51182a..ed743bbad87c 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -88,17 +88,19 @@ node. 2. PM QoS per-device latency and flags framework -For each device, there are two lists of PM QoS requests. One is maintained -along with the aggregated target of resume latency value and the other is for -PM QoS flags. Values are updated in response to changes of the request list. +For each device, there are three lists of PM QoS requests. Two of them are +maintained along with the aggregated targets of resume latency and active +state latency tolerance (in microseconds) and the third one is for PM QoS flags. +Values are updated in response to changes of the request list. -Target resume latency value is simply the minimum of the request values held in -the parameter list elements. The PM QoS flags aggregate value is a gather -(bitwise OR) of all list elements' values. Two device PM QoS flags are defined -currently: PM_QOS_FLAG_NO_POWER_OFF and PM_QOS_FLAG_REMOTE_WAKEUP. +The target values of resume latency and active state latency tolerance are +simply the minimum of the request values held in the parameter list elements. +The PM QoS flags aggregate value is a gather (bitwise OR) of all list elements' +values. Two device PM QoS flags are defined currently: PM_QOS_FLAG_NO_POWER_OFF +and PM_QOS_FLAG_REMOTE_WAKEUP. -Note: the aggregated target value is implemented in such a way that reading the -aggregated value does not require any locking mechanism. +Note: The aggregated target values are implemented in such a way that reading +the aggregated value does not require any locking mechanism. From kernel mode the use of this interface is the following: @@ -177,3 +179,42 @@ The callback is called when the aggregated value for any device is changed int dev_pm_qos_remove_global_notifier(notifier): Removes the notification callback function from the global notification tree of the framework. + + +Active state latency tolerance + +This device PM QoS type is used to support systems in which hardware may switch +to energy-saving operation modes on the fly. In those systems, if the operation +mode chosen by the hardware attempts to save energy in an overly aggressive way, +it may cause excess latencies to be visible to software, causing it to miss +certain protocol requirements or target frame or sample rates etc. + +If there is a latency tolerance control mechanism for a given device available +to software, the .set_latency_tolerance callback in that device's dev_pm_info +structure should be populated. The routine pointed to by it should implement +whatever is necessary to transfer the effective requirement value to the +hardware. + +Whenever the effective latency tolerance changes for the device, its +.set_latency_tolerance() callback will be executed and the effective value will +be passed to it. If that value is negative, which means that the list of +latency tolerance requirements for the device is empty, the callback is expected +to switch the underlying hardware latency tolerance control mechanism to an +autonomous mode if available.
If that value is PM_QOS_LATENCY_ANY, in turn, and +the hardware supports a special "no requirement" setting, the callback is +expected to use it. That allows software to prevent the hardware from +automatically updating the device's latency tolerance in response to its power +state changes (e.g. during transitions from D3cold to D0), which generally may +be done in the autonomous latency tolerance control mode. + +If .set_latency_tolerance() is present for the device, sysfs attribute +pm_qos_latency_tolerance_us will be present in the device's power directory. +Then, user space can use that attribute to specify its latency tolerance +requirement for the device, if any. Writing "any" to it means "no requirement, +but do not let the hardware control latency tolerance" and writing "auto" to it +allows the hardware to be switched to the autonomous mode if there are no other +requirements from the kernel side in the device's list. + +Kernel code can use the functions described above along with the +DEV_PM_QOS_LATENCY_TOLERANCE device PM QoS type to add, remove and update +latency tolerance requirements for devices. diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index c754e55f9dcb..84756f7f09d9 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -151,6 +151,14 @@ static int apply_constraint(struct dev_pm_qos_request *req, req); } break; + case DEV_PM_QOS_LATENCY_TOLERANCE: + ret = pm_qos_update_target(&qos->latency_tolerance, + &req->data.pnode, action, value); + if (ret) { + value = pm_qos_read_value(&qos->latency_tolerance); + req->dev->power.set_latency_tolerance(req->dev, value); + } + break; case DEV_PM_QOS_FLAGS: ret = pm_qos_update_flags(&qos->flags, &req->data.flr, action, value); @@ -194,6 +202,13 @@ static int dev_pm_qos_constraints_allocate(struct device *dev) c->type = PM_QOS_MIN; c->notifiers = n; + c = &qos->latency_tolerance; + plist_head_init(&c->list); + c->target_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; + c->default_value = PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE; + c->no_constraint_value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; + c->type = PM_QOS_MIN; + INIT_LIST_HEAD(&qos->flags.list); spin_lock_irq(&dev->power.lock); @@ -247,6 +262,11 @@ void dev_pm_qos_constraints_destroy(struct device *dev) apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); memset(req, 0, sizeof(*req)); } + c = &qos->latency_tolerance; + plist_for_each_entry_safe(req, tmp, &c->list, data.pnode) { + apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); + memset(req, 0, sizeof(*req)); + } f = &qos->flags; list_for_each_entry_safe(req, tmp, &f->list, data.flr.node) { apply_constraint(req, PM_QOS_REMOVE_REQ, PM_QOS_DEFAULT_VALUE); @@ -266,6 +286,40 @@ void dev_pm_qos_constraints_destroy(struct device *dev) mutex_unlock(&dev_pm_qos_sysfs_mtx); } +static bool dev_pm_qos_invalid_request(struct device *dev, + struct dev_pm_qos_request *req) +{ + return !req || (req->type == DEV_PM_QOS_LATENCY_TOLERANCE + && !dev->power.set_latency_tolerance); +} + +static int __dev_pm_qos_add_request(struct device *dev, + struct dev_pm_qos_request *req, + enum dev_pm_qos_req_type type, s32 value) +{ + int ret = 0; + + if (!dev || dev_pm_qos_invalid_request(dev, req)) + return -EINVAL; + + if (WARN(dev_pm_qos_request_active(req), + "%s() called for already added request\n", __func__)) + return -EINVAL; + + if (IS_ERR(dev->power.qos)) + ret = -ENODEV; + else if (!dev->power.qos) + ret = dev_pm_qos_constraints_allocate(dev); + 
trace_dev_pm_qos_add_request(dev_name(dev), type, value); + if (!ret) { + req->dev = dev; + req->type = type; + ret = apply_constraint(req, PM_QOS_ADD_REQ, value); + } + return ret; +} + /** * dev_pm_qos_add_request - inserts new qos request into the list * @dev: target device for the constraint @@ -291,31 +345,11 @@ void dev_pm_qos_constraints_destroy(struct device *dev) int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, enum dev_pm_qos_req_type type, s32 value) { - int ret = 0; - - if (!dev || !req) /*guard against callers passing in null */ - return -EINVAL; - - if (WARN(dev_pm_qos_request_active(req), - "%s() called for already added request\n", __func__)) - return -EINVAL; + int ret; mutex_lock(&dev_pm_qos_mtx); - - if (IS_ERR(dev->power.qos)) - ret = -ENODEV; - else if (!dev->power.qos) - ret = dev_pm_qos_constraints_allocate(dev); - - trace_dev_pm_qos_add_request(dev_name(dev), type, value); - if (!ret) { - req->dev = dev; - req->type = type; - ret = apply_constraint(req, PM_QOS_ADD_REQ, value); - } - + ret = __dev_pm_qos_add_request(dev, req, type, value); mutex_unlock(&dev_pm_qos_mtx); - return ret; } EXPORT_SYMBOL_GPL(dev_pm_qos_add_request); @@ -343,6 +377,7 @@ static int __dev_pm_qos_update_request(struct dev_pm_qos_request *req, switch(req->type) { case DEV_PM_QOS_RESUME_LATENCY: + case DEV_PM_QOS_LATENCY_TOLERANCE: curr_value = req->data.pnode.prio; break; case DEV_PM_QOS_FLAGS: @@ -563,6 +598,10 @@ static void __dev_pm_qos_drop_user_request(struct device *dev, req = dev->power.qos->resume_latency_req; dev->power.qos->resume_latency_req = NULL; break; + case DEV_PM_QOS_LATENCY_TOLERANCE: + req = dev->power.qos->latency_tolerance_req; + dev->power.qos->latency_tolerance_req = NULL; + break; case DEV_PM_QOS_FLAGS: req = dev->power.qos->flags_req; dev->power.qos->flags_req = NULL; @@ -768,6 +807,67 @@ int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set) pm_runtime_put(dev); return ret; } + +/** + * dev_pm_qos_get_user_latency_tolerance - Get user space latency tolerance. + * @dev: Device to obtain the user space latency tolerance for. + */ +s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) +{ + s32 ret; + + mutex_lock(&dev_pm_qos_mtx); + ret = IS_ERR_OR_NULL(dev->power.qos) + || !dev->power.qos->latency_tolerance_req ? + PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT : + dev->power.qos->latency_tolerance_req->data.pnode.prio; + mutex_unlock(&dev_pm_qos_mtx); + return ret; +} + +/** + * dev_pm_qos_update_user_latency_tolerance - Update user space latency tolerance. + * @dev: Device to update the user space latency tolerance for. + * @val: New user space latency tolerance for @dev (negative values disable). 
+ */ +int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) +{ + int ret; + + mutex_lock(&dev_pm_qos_mtx); + + if (IS_ERR_OR_NULL(dev->power.qos) + || !dev->power.qos->latency_tolerance_req) { + struct dev_pm_qos_request *req; + + if (val < 0) { + ret = -EINVAL; + goto out; + } + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) { + ret = -ENOMEM; + goto out; + } + ret = __dev_pm_qos_add_request(dev, req, DEV_PM_QOS_LATENCY_TOLERANCE, val); + if (ret < 0) { + kfree(req); + goto out; + } + dev->power.qos->latency_tolerance_req = req; + } else { + if (val < 0) { + __dev_pm_qos_drop_user_request(dev, DEV_PM_QOS_LATENCY_TOLERANCE); + ret = 0; + } else { + ret = __dev_pm_qos_update_request(dev->power.qos->latency_tolerance_req, val); + } + } + + out: + mutex_unlock(&dev_pm_qos_mtx); + return ret; +} #else /* !CONFIG_PM_RUNTIME */ static void __dev_pm_qos_hide_latency_limit(struct device *dev) {} static void __dev_pm_qos_hide_flags(struct device *dev) {} diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index 4e24955aac8a..95b181d1ca6d 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -246,6 +246,40 @@ static ssize_t pm_qos_resume_latency_store(struct device *dev, static DEVICE_ATTR(pm_qos_resume_latency_us, 0644, pm_qos_resume_latency_show, pm_qos_resume_latency_store); +static ssize_t pm_qos_latency_tolerance_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + s32 value = dev_pm_qos_get_user_latency_tolerance(dev); + + if (value < 0) + return sprintf(buf, "auto\n"); + else if (value == PM_QOS_LATENCY_ANY) + return sprintf(buf, "any\n"); + + return sprintf(buf, "%d\n", value); +} + +static ssize_t pm_qos_latency_tolerance_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t n) +{ + s32 value; + int ret; + + if (kstrtos32(buf, 0, &value)) { + if (!strcmp(buf, "auto") || !strcmp(buf, "auto\n")) + value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; + else if (!strcmp(buf, "any") || !strcmp(buf, "any\n")) + value = PM_QOS_LATENCY_ANY; + } + ret = dev_pm_qos_update_user_latency_tolerance(dev, value); + return ret < 0 ? 
ret : n; +} + +static DEVICE_ATTR(pm_qos_latency_tolerance_us, 0644, + pm_qos_latency_tolerance_show, pm_qos_latency_tolerance_store); + static ssize_t pm_qos_no_power_off_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -631,6 +665,17 @@ static struct attribute_group pm_qos_resume_latency_attr_group = { .attrs = pm_qos_resume_latency_attrs, }; +static struct attribute *pm_qos_latency_tolerance_attrs[] = { +#ifdef CONFIG_PM_RUNTIME + &dev_attr_pm_qos_latency_tolerance_us.attr, +#endif /* CONFIG_PM_RUNTIME */ + NULL, +}; +static struct attribute_group pm_qos_latency_tolerance_attr_group = { + .name = power_group_name, + .attrs = pm_qos_latency_tolerance_attrs, +}; + static struct attribute *pm_qos_flags_attrs[] = { #ifdef CONFIG_PM_RUNTIME &dev_attr_pm_qos_no_power_off.attr, @@ -656,18 +701,23 @@ int dpm_sysfs_add(struct device *dev) if (rc) goto err_out; } - if (device_can_wakeup(dev)) { rc = sysfs_merge_group(&dev->kobj, &pm_wakeup_attr_group); - if (rc) { - if (pm_runtime_callbacks_present(dev)) - sysfs_unmerge_group(&dev->kobj, - &pm_runtime_attr_group); - goto err_out; - } + if (rc) + goto err_runtime; + } + if (dev->power.set_latency_tolerance) { + rc = sysfs_merge_group(&dev->kobj, + &pm_qos_latency_tolerance_attr_group); + if (rc) + goto err_wakeup; } return 0; + err_wakeup: + sysfs_unmerge_group(&dev->kobj, &pm_wakeup_attr_group); + err_runtime: + sysfs_unmerge_group(&dev->kobj, &pm_runtime_attr_group); err_out: sysfs_remove_group(&dev->kobj, &pm_attr_group); return rc; @@ -710,6 +760,7 @@ void rpm_sysfs_remove(struct device *dev) void dpm_sysfs_remove(struct device *dev) { + sysfs_unmerge_group(&dev->kobj, &pm_qos_latency_tolerance_attr_group); dev_pm_qos_constraints_destroy(dev); rpm_sysfs_remove(dev); sysfs_unmerge_group(&dev->kobj, &pm_wakeup_attr_group); diff --git a/include/linux/pm.h b/include/linux/pm.h index 8c6583a53a06..db2be5f3e030 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -582,6 +582,7 @@ struct dev_pm_info { unsigned long accounting_timestamp; #endif struct pm_subsys_data *subsys_data; /* Owned by the subsystem. 
*/ + void (*set_latency_tolerance)(struct device *, s32); struct dev_pm_qos *qos; }; diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 2d8ce50877d8..0b476019be55 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -33,6 +33,9 @@ enum pm_qos_flags_status { #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 +#define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 +#define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) +#define PM_QOS_LATENCY_ANY ((s32)(~(__u32)0 >> 1)) #define PM_QOS_FLAG_NO_POWER_OFF (1 << 0) #define PM_QOS_FLAG_REMOTE_WAKEUP (1 << 1) @@ -50,6 +53,7 @@ struct pm_qos_flags_request { enum dev_pm_qos_req_type { DEV_PM_QOS_RESUME_LATENCY = 1, + DEV_PM_QOS_LATENCY_TOLERANCE, DEV_PM_QOS_FLAGS, }; @@ -89,8 +93,10 @@ struct pm_qos_flags { struct dev_pm_qos { struct pm_qos_constraints resume_latency; + struct pm_qos_constraints latency_tolerance; struct pm_qos_flags flags; struct dev_pm_qos_request *resume_latency_req; + struct dev_pm_qos_request *latency_tolerance_req; struct dev_pm_qos_request *flags_req; }; @@ -196,6 +202,8 @@ void dev_pm_qos_hide_latency_limit(struct device *dev); int dev_pm_qos_expose_flags(struct device *dev, s32 value); void dev_pm_qos_hide_flags(struct device *dev); int dev_pm_qos_update_flags(struct device *dev, s32 mask, bool set); +s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev); +int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val); static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { @@ -215,6 +223,10 @@ static inline int dev_pm_qos_expose_flags(struct device *dev, s32 value) static inline void dev_pm_qos_hide_flags(struct device *dev) {} static inline int dev_pm_qos_update_flags(struct device *dev, s32 m, bool set) { return 0; } +static inline s32 dev_pm_qos_get_user_latency_tolerance(struct device *dev) + { return PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; } +static inline int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) + { return 0; } static inline s32 dev_pm_qos_requested_resume_latency(struct device *dev) { return 0; } static inline s32 dev_pm_qos_requested_flags(struct device *dev) { return 0; } diff --git a/kernel/power/qos.c b/kernel/power/qos.c index e23ae38e647f..884b77058864 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -173,6 +173,7 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, { unsigned long flags; int prev_value, curr_value, new_value; + int ret; spin_lock_irqsave(&pm_qos_lock, flags); prev_value = pm_qos_get_value(c); @@ -208,13 +209,15 @@ int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, trace_pm_qos_update_target(action, prev_value, curr_value); if (prev_value != curr_value) { - blocking_notifier_call_chain(c->notifiers, - (unsigned long)curr_value, - NULL); - return 1; + ret = 1; + if (c->notifiers) + blocking_notifier_call_chain(c->notifiers, + (unsigned long)curr_value, + NULL); } else { - return 0; + ret = 0; } + return ret; } /** -- cgit v1.3-14-g43fede From 38033c37faab850ed5d33bb675c4de6c66be84d8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jan 2014 20:32:21 +0100 Subject: sched: Push down pre_schedule() and idle_balance() This patch merges idle_balance() and pre_schedule() and pushes both of them into pick_next_task().
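For orientation, the traversal the rest of this message relies on looks roughly like the following (a simplified sketch of the scheduler core's class loop, not the verbatim kernel code):

	const struct sched_class *class;
	struct task_struct *p;

	/* walk the classes in priority order: stop, dl, rt, fair, idle */
	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p)
			return p;
	}

Because the dl and rt pick_next_task() implementations run before fair's in this loop, doing their pulls at the top of those functions preserves the old pre_schedule() ordering.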
Conceptually pre_schedule() and idle_balance() are rather similar, both are used to pull more work onto the current CPU. We cannot however first move idle_balance() into pre_schedule_fair() since there is no guarantee the last runnable task is a fair task, and thus we would miss newidle balances. Similarly, the dl and rt pre_schedule calls must be run before idle_balance() since their respective tasks have higher priority and it would not do to delay their execution searching for less important tasks first. However, by noticing that pick_next_task() already traverses the sched_class hierarchy in the right order, we can get the right behaviour and do away with both calls. We must however change the special case optimization to also require that prev is of the fair sched class, otherwise we can miss doing a dl or rt pull where we needed one. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-a8k6vvaebtn64nie345kx1je@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 26 ++------------------------ kernel/sched/deadline.c | 15 +++++++-------- kernel/sched/fair.c | 26 ++++++++++++++++++++++---- kernel/sched/idle_task.c | 12 +++++------- kernel/sched/rt.c | 16 ++++++++-------- kernel/sched/sched.h | 1 - 6 files changed, 44 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dedb5f07666e..3068f37f7c5f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2169,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) #ifdef CONFIG_SMP -/* assumes rq->lock is held */ -static inline void pre_schedule(struct rq *rq, struct task_struct *prev) -{ - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -} - /* rq->lock is NOT held, but preemption is disabled */ static inline void post_schedule(struct rq *rq) { @@ -2193,10 +2186,6 @@ static inline void post_schedule(struct rq *rq) #else -static inline void pre_schedule(struct rq *rq, struct task_struct *p) -{ -} - static inline void post_schedule(struct rq *rq) { } @@ -2592,7 +2581,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev) * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.h_nr_running)) { + if (likely(prev->sched_class == &fair_sched_class && + rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); if (likely(p)) return p; @@ -2695,18 +2685,6 @@ need_resched: switch_count = &prev->nvcsw; } - pre_schedule(rq, prev); - - if (unlikely(!rq->nr_running)) { - /* - * We must set idle_stamp _before_ calling idle_balance(), such - * that we measure the duration of idle_balance() as idle time.
- */ - rq->idle_stamp = rq_clock(rq); - if (idle_balance(rq)) - rq->idle_stamp = 0; - } - if (prev->on_rq || rq->skip_clock_update < 0) update_rq_clock(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 50797d576080..ed31ef66ab9d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +static int pull_dl_task(struct rq *this_rq); + #endif /* CONFIG_SMP */ /* @@ -998,6 +1000,11 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; +#ifdef CONFIG_SMP + if (dl_task(prev)) + pull_dl_task(rq); +#endif + if (unlikely(!dl_rq->dl_nr_running)) return NULL; @@ -1429,13 +1436,6 @@ skip: return ret; } -static void pre_schedule_dl(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull other tasks here */ - if (dl_task(prev)) - pull_dl_task(rq); -} - static void post_schedule_dl(struct rq *rq) { push_dl_tasks(rq); @@ -1628,7 +1628,6 @@ const struct sched_class dl_sched_class = { .set_cpus_allowed = set_cpus_allowed_dl, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, - .pre_schedule = pre_schedule_dl, .post_schedule = post_schedule_dl, .task_woken = task_woken_dl, #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a81b241ff70f..43b49fe077ab 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2577,7 +2577,8 @@ void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } -#else +#else /* CONFIG_SMP */ + static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} @@ -2589,7 +2590,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} -#endif +#endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -4682,9 +4683,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) struct sched_entity *se; struct task_struct *p; +again: __maybe_unused #ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) - return NULL; + goto idle; if (!prev || prev->sched_class != &fair_sched_class) goto simple; @@ -4760,7 +4762,7 @@ simple: #endif if (!cfs_rq->nr_running) - return NULL; + goto idle; if (prev) prev->sched_class->put_prev_task(rq, prev); @@ -4777,6 +4779,22 @@ simple: hrtick_start_fair(rq, p); return p; + +idle: +#ifdef CONFIG_SMP + idle_enter_fair(rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. 
+ */ + rq->idle_stamp = rq_clock(rq); + if (idle_balance(rq)) { /* drops rq->lock */ + rq->idle_stamp = 0; + goto again; + } +#endif + + return NULL; } /* diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 721371bf03bd..f7d03af79a5b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,13 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } - -static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) -{ - idle_exit_fair(rq); - rq_last_tick_reset(rq); -} #endif /* CONFIG_SMP */ + /* * Idle tasks are unconditionally rescheduled: */ @@ -56,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + idle_exit_fair(rq); + rq_last_tick_reset(rq); +#endif } static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) @@ -99,7 +98,6 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, - .pre_schedule = pre_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a15ca1c0c7bf..72f9ec759972 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) #ifdef CONFIG_SMP +static int pull_rt_task(struct rq *this_rq); + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -1330,6 +1332,12 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; +#ifdef CONFIG_SMP + /* Try to pull RT tasks here if we lower this rq's prio */ + if (rq->rt.highest_prio.curr > prev->prio) + pull_rt_task(rq); +#endif + if (!rt_rq->rt_nr_running) return NULL; @@ -1721,13 +1729,6 @@ skip: return ret; } -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) -{ - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) - pull_rt_task(rq); -} - static void post_schedule_rt(struct rq *rq) { push_rt_tasks(rq); @@ -2004,7 +2005,6 @@ const struct sched_class rt_sched_class = { .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, - .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c534cf4181ab..1bf34c257d3b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1118,7 +1118,6 @@ struct sched_class { int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); void (*migrate_task_rq)(struct task_struct *p, int next_cpu); - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); -- cgit v1.3-14-g43fede From 27f17580fd2c7514c8f5cce22ab903c6f3ddf458 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 29 Jan 2014 14:29:33 +0000 Subject: sched: Delete is_same_group() outside CONFIG_FAIR_GROUP_SCHED Since is_same_group() is only used in the group scheduling code, there is no need to define it outside CONFIG_FAIR_GROUP_SCHED. 
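For reference, the group-scheduling variant that remains under CONFIG_FAIR_GROUP_SCHED looks roughly like this (simplified sketch; it returns the common cfs_rq when both entities are in the same group, NULL otherwise):

	static inline struct cfs_rq *
	is_same_group(struct sched_entity *se, struct sched_entity *pse)
	{
		/* same group iff both entities queue on the same cfs_rq */
		if (se->cfs_rq == pse->cfs_rq)
			return se->cfs_rq;

		return NULL;
	}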
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391005773-29493-1-git-send-email-dietmar.eggemann@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 43b49fe077ab..235cfa7ad8fc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -415,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) #define for_each_leaf_cfs_rq(rq, cfs_rq) \ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) -static inline struct cfs_rq * -is_same_group(struct sched_entity *se, struct sched_entity *pse) -{ - return cfs_rq_of(se); /* always the same rq */ -} - static inline struct sched_entity *parent_entity(struct sched_entity *se) { return NULL; -- cgit v1.3-14-g43fede From 37e6bae8395a94b4dd934c92b02b9408be992365 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Thu, 23 Jan 2014 18:39:54 +0800 Subject: sched: Add statistic for newidle load balance cost Track rq->max_idle_balance_cost and sd->max_newidle_lb_cost; it's useful to know these values in debug mode. Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/52E0F3BF.5020904@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 ++++++--- kernel/sched/debug.c | 1 + 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3068f37f7c5f..fb9764fbc537 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4811,7 +4811,7 @@ set_table_entry(struct ctl_table *entry, static struct ctl_table * sd_alloc_ctl_domain_table(struct sched_domain *sd) { - struct ctl_table *table = sd_alloc_ctl_entry(13); + struct ctl_table *table = sd_alloc_ctl_entry(14); if (table == NULL) return NULL; @@ -4839,9 +4839,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) sizeof(int), 0644, proc_dointvec_minmax, false); set_table_entry(&table[10], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "name", sd->name, + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[12] is terminator */ + /* &table[13] is terminator */ return table; } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 31b908daaa1b..f3344c31632a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -321,6 +321,7 @@ do { \ P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); + P64(max_idle_balance_cost); #endif P(ttwu_count); -- cgit v1.3-14-g43fede From af8cd8ef726f335815233d03b8723e9c52041edd Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Wed, 29 Jan 2014 15:31:36 -0500 Subject: sched/idle: Move the cpuidle entry point to the generic idle loop In order to integrate cpuidle with the scheduler, we must have a better proximity in the core code with what cpuidle is doing and not delegate such interaction to arch code. Architectures implementing arch_cpu_idle() should simply enter a cheap idle mode in the absence of a proper cpuidle driver. In both cases, i.e. whether it is a cpuidle driver or the default arch_cpu_idle(), the calling convention expects IRQs to be disabled on entry and enabled on exit. There is a warning in place already but let's add a forced IRQ enable here as well.
This will allow for removing the forced IRQ enable some implementations do locally and allowing for the warning to trigger. Signed-off-by: Nicolas Pitre Acked-by: Daniel Lezcano Cc: Benjamin Herrenschmidt Cc: Preeti U Murthy Cc: Paul Mundt Cc: "Rafael J. Wysocki" Cc: Olof Johansson Cc: Russell King Cc: Linus Torvalds Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.LFD.2.11.1401291526320.1652@knanqh.ubzr Signed-off-by: Ingo Molnar --- kernel/cpu/idle.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 277f494c2a9a..b7976a127178 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -3,6 +3,7 @@ */ #include #include +#include #include #include #include @@ -95,8 +96,10 @@ static void cpu_idle_loop(void) if (!current_clr_polling_and_test()) { stop_critical_timings(); rcu_idle_enter(); - arch_cpu_idle(); - WARN_ON_ONCE(irqs_disabled()); + if (cpuidle_idle_call()) + arch_cpu_idle(); + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); rcu_idle_exit(); start_critical_timings(); } else { -- cgit v1.3-14-g43fede From cf37b6b48428d6be8f8762b3599d529c44644fb2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Sun, 26 Jan 2014 23:42:01 -0500 Subject: sched/idle: Move cpu/idle.c to sched/idle.c Integration of cpuidle with the scheduler requires that the idle loop be closely integrated with the scheduler proper. Moving cpu/idle.c into the sched directory will allow for a smoother integration, and eliminate a subdirectory which contained only one source file. Signed-off-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/alpine.LFD.2.11.1401301102210.1652@knanqh.ubzr Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 - kernel/cpu/Makefile | 1 - kernel/sched/Makefile | 2 +- kernel/sched/idle.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 145 insertions(+), 3 deletions(-) delete mode 100644 kernel/cpu/Makefile create mode 100644 kernel/sched/idle.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b6..6f1c7e5cfca1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -22,7 +22,6 @@ obj-y += sched/ obj-y += locking/ obj-y += power/ obj-y += printk/ -obj-y += cpu/ obj-y += irq/ obj-y += rcu/ diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile deleted file mode 100644 index 59ab052ef7a0..000000000000 --- a/kernel/cpu/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y = idle.o diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 9a95c8c2af2a..ab32b7b0db5c 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -13,7 +13,7 @@ endif obj-y += core.o proc.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o +obj-y += wait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c new file mode 100644 index 000000000000..14ca43430aee --- /dev/null +++ b/kernel/sched/idle.c @@ -0,0 +1,144 @@ +/* + * Generic entry point for the idle threads + */ +#include +#include +#include +#include +#include +#include + +#include + +#include + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + 
+#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + rcu_idle_enter(); + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!tif_need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + rcu_idle_exit(); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; + local_irq_enable(); +} + +/* + * Generic idle loop implementation + */ +static void cpu_idle_loop(void) +{ + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + /* + * In poll mode we reenable interrupts and spin. + * + * Also if we detected in the wakeup from idle + * path that the tick broadcast device expired + * for us, we don't want to go deep idle as we + * know that the IPI is going to arrive right + * away + */ + if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + cpu_idle_poll(); + } else { + if (!current_clr_polling_and_test()) { + stop_critical_timings(); + rcu_idle_enter(); + if (cpuidle_idle_call()) + arch_cpu_idle(); + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); + rcu_idle_exit(); + start_critical_timings(); + } else { + local_irq_enable(); + } + __current_set_polling(); + } + arch_cpu_idle_exit(); + /* + * We need to test and propagate the TIF_NEED_RESCHED + * bit here because we might not have send the + * reschedule IPI to idle tasks. + */ + if (tif_need_resched()) + set_preempt_need_resched(); + } + tick_nohz_idle_exit(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif + __current_set_polling(); + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} -- cgit v1.3-14-g43fede From 2c45aada341121438affc4cb8d5b4cfaa2813d3d Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Mon, 10 Feb 2014 13:39:53 -0500 Subject: genirq: Add missing irq_to_desc export for CONFIG_SPARSE_IRQ=n In allmodconfig builds for sparc and any other arch which does not set CONFIG_SPARSE_IRQ, the following will be seen at modpost: CC [M] lib/cpu-notifier-error-inject.o CC [M] lib/pm-notifier-error-inject.o ERROR: "irq_to_desc" [drivers/gpio/gpio-mcp23s08.ko] undefined! 
make[2]: *** [__modpost] Error 1 This happens because commit 3911ff30f5 ("genirq: export handle_edge_irq() and irq_to_desc()") added one export for it, but there were actually two instances of it, in an if/else clause for CONFIG_SPARSE_IRQ. Add the second one. Signed-off-by: Paul Gortmaker Cc: Jiri Kosina Cc: stable@vger.kernel.org # 3.4+ Link: http://lkml.kernel.org/r/1392057610-11514-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdesc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 192a302d6cfd..8ab8e9390297 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -274,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } +EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { -- cgit v1.3-14-g43fede From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 11 Feb 2014 16:05:46 +0800 Subject: cgroup: protect modifications to cgroup_idr with cgroup_mutex Setup cgroupfs like this: # mount -t cgroup -o cpuacct xxx /cgroup # mkdir /cgroup/sub1 # mkdir /cgroup/sub2 Then run these two commands: # for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } & # for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } & After seconds you may see this warning: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0() idr_remove called for id=6 which is not allocated. ... Call Trace: [] dump_stack+0x7a/0x96 [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_fmt+0x46/0x50 [] sub_remove+0x87/0x1b0 [] ? css_killed_work_fn+0x32/0x1b0 [] idr_remove+0x25/0xd0 [] cgroup_destroy_css_killed+0x5b/0xc0 [] css_killed_work_fn+0x130/0x1b0 [] process_one_work+0x26c/0x550 [] worker_thread+0x12e/0x3b0 [] kthread+0xe6/0xf0 [] ret_from_fork+0x7c/0xb0 ---[ end trace 2d1577ec10cf80d0 ]--- It's because allocating/removing cgroup ID is not properly synchronized. The bug was introduced when we converted cgroup_ida to cgroup_idr. While synchronization is already done inside ida_simple_{get,remove}(), users are responsible for concurrent calls to idr_{alloc,remove}(). tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from cgroup_create()"). Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr") Cc: #3.12+ Reported-by: Michal Hocko Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 34 ++++++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5c097596104b..9450f025fe0c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -166,6 +166,8 @@ struct cgroup { * * The ID of the root cgroup is always 0, and a new cgroup * will be assigned with a smallest available ID. + * + * Allocating/Removing ID must be protected by cgroup_mutex. */ int id; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3edf7163b84f..52719ce55dd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * per-subsystem and moved to css->id so that lookups are * successful until the target css is released. 
*/ + mutex_lock(&cgroup_mutex); idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); cgrp->id = -1; call_rcu(&cgrp->rcu_head, cgroup_free_rcu); @@ -4167,16 +4169,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } rcu_assign_pointer(cgrp->name, name); - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) { - err = -ENOMEM; - goto err_free_name; - } - /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_id; + goto err_free_name; + } + + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) { + err = -ENOMEM; + goto err_unlock; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_unlock; + goto err_free_id; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; -err_unlock: - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); + /* Release the reference count that we took on the superblock */ + deactivate_super(sb); +err_unlock: + mutex_unlock(&cgroup_mutex); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: -- cgit v1.3-14-g43fede From 5a17f543ed6808e9085063277fe46795dea484bd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:47 -0500 Subject: cgroup: improve css_from_dir() into css_tryget_from_dir() css_from_dir() returns the matching css (cgroup_subsys_state) given a dentry and subsystem. The function doesn't pin the css before returning and requires the caller to be holding RCU read lock or cgroup_mutex and handling pinning on the caller side. Given that users of the function are likely to want to pin the returned css (both existing users do) and that getting and putting css's are very cheap, there's no reason for the interface to be tricky like this. Rename css_from_dir() to css_tryget_from_dir() and make it try to pin the found css and return it only if pinning succeeded. The callers are updated so that they no longer do RCU locking and pinning around the function and just use the returned css. This will also ease converting cgroup to kernfs. 
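With the pinning folded in, a typical caller reduces to the following shape (a hypothetical sketch; error handling shortened):

	struct cgroup_subsys_state *css;

	/* returns a pinned css or an ERR_PTR(); no RCU locking needed here */
	css = css_tryget_from_dir(dentry, ss);
	if (IS_ERR(css))
		return PTR_ERR(css);

	/* ... use css ... */

	css_put(css);	/* drop the reference taken by css_tryget_from_dir() */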
Signed-off-by: Tejun Heo Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 25 ++++++++++++++++--------- kernel/events/core.c | 17 +---------------- mm/memcontrol.c | 16 +++++++--------- 4 files changed, 26 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c86ba7ff7a7e..1ba4fc08f776 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -825,8 +825,8 @@ int css_scan_tasks(struct cgroup_subsys_state *css, int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2de8decfd99f..fc2db071d95e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4978,28 +4978,35 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under cgroup_mutex or RCU read lock. The caller is - * responsible for pinning the returned css if it needs to be accessed - * outside the critical section. + * If @dentry is a directory for a cgroup which has @ss enabled on it, try + * to get the corresponding css and return it. If such css doesn't exist + * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - - cgroup_assert_mutex_or_rcu_locked(); + struct cgroup_subsys_state *css; /* is @dentry a cgroup dir? 
*/ if (!dentry->d_inode || dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); + rcu_read_lock(); + cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); + css = cgroup_css(cgrp, ss); + + if (!css || !css_tryget(css)) + css = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + return css; } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 64903731d834..a3c3ab50271a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -370,11 +370,6 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline bool perf_tryget_cgroup(struct perf_event *event) -{ - return css_tryget(&event->cgrp->css); -} - static inline void perf_put_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); @@ -593,9 +588,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - rcu_read_lock(); - - css = css_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); + css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -604,13 +597,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; - /* must be done before we fput() the file */ - if (!perf_tryget_cgroup(event)) { - event->cgrp = NULL; - ret = -ENOENT; - goto out; - } - /* * all events in a group must monitor * the same cgroup because a task belongs @@ -621,7 +607,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - rcu_read_unlock(); fdput(f); return ret; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 04a97bce2270..102ab48ffa13 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6183,17 +6183,15 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - rcu_read_lock(); - + cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); ret = -EINVAL; - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, - &memory_cgrp_subsys); - if (cfile_css == css && css_tryget(css)) - ret = 0; - - rcu_read_unlock(); - if (ret) + if (IS_ERR(cfile_css)) goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); + goto out_put_cfile; + } ret = event->register_event(memcg, event->eventfd, buffer); if (ret) -- cgit v1.3-14-g43fede From ace2bee8135a3dc725958b8d08c55ee9df813d39 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:47 -0500 Subject: cgroup: introduce cgroup_tree_mutex Currently cgroup uses combination of inode->i_mutex'es and cgroup_mutex for synchronization. With the scheduled kernfs conversion, i_mutex'es will be removed. Unfortunately, just using cgroup_mutex isn't possible. All kernfs file and syscall operations, most of which require grabbing cgroup_mutex, will be called with kernfs active ref held and, if we try to perform kernfs removals under cgroup_mutex, it can deadlock as kernfs_remove() tries to drain the target node. Let's introduce a new outer mutex, cgroup_tree_mutex, which protects stuff used during hierarchy changing operations - cftypes and all the operations which may affect the cgroupfs. It also covers css association and iteration. This allows cgroup_css(), for_each_css() and other css iterators to be called under cgroup_tree_mutex. 
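In code terms, hierarchy-changing paths take the two mutexes in a fixed order (a simplified sketch mirroring the lock/unlock pairs added throughout the diff below):

	mutex_lock(&cgroup_tree_mutex);		/* outer: cftypes, tree topology */
	mutex_lock(&cgroup_mutex);		/* inner: everything else */

	/* ... modify the hierarchy ... */

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);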
The new mutex will nest above both kernfs's active ref protection and cgroup_mutex. By protecting tree modifications with a separate outer mutex, we can get rid of the aforementioned deadlock condition. Actual file additions and removals now require cgroup_tree_mutex instead of cgroup_mutex. Currently, cgroup_tree_mutex is never used without cgroup_mutex; however, we'll soon add hierarchy modification sections which are only protected by cgroup_tree_mutex. In the future, we might want to make the locking more granular by better splitting the coverages of the two mutexes. For now, this should do. v2: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 66 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fc2db071d95e..cb20d12cb096 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -67,6 +67,15 @@ */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ +/* + * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file + * creation/removal and hierarchy changing operations including cgroup + * creation, removal, css association and controller rebinding. This outer + * lock is needed mainly to resolve the circular dependency between kernfs + * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. + */ +static DEFINE_MUTEX(cgroup_tree_mutex); + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -84,10 +93,11 @@ static DEFINE_MUTEX(cgroup_mutex); */ static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutex_or_rcu_locked() \ +#define cgroup_assert_mutexes_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_mutex or RCU read lock required"); + "cgroup_[tree_]mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -179,7 +189,8 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, { if (ss) return rcu_dereference_check(cgrp->subsys[ss->id], - lockdep_is_held(&cgroup_mutex)); + lockdep_is_held(&cgroup_tree_mutex) || + lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; } @@ -235,6 +246,7 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else @@ -883,7 +895,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) struct cfent *cfe; lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_tree_mutex); /* * If we're doing cleanup due to failure of cgroup_create(), @@ -948,7 +960,8 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup_subsys *ss; int i, ret; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* Check that any added subsystems are currently free */ for_each_subsys(ss, i) @@ -1220,6 +1233,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) } mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* See
what subsystems are wanted */ @@ -1263,6 +1277,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.release_agent); kfree(opts.name); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; } @@ -1494,6 +1509,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, inode = sb->s_root->d_inode; mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); @@ -1568,6 +1584,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&inode->i_mutex); } else { /* @@ -1598,6 +1615,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, unlock_drop: cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); @@ -1620,6 +1638,7 @@ static void cgroup_kill_sb(struct super_block *sb) BUG_ON(!list_empty(&cgrp->children)); mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Rebind all subsystems back to the default hierarchy */ @@ -1650,6 +1669,7 @@ static void cgroup_kill_sb(struct super_block *sb) cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); simple_xattrs_free(&cgrp->xattrs); @@ -2625,7 +2645,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], int ret; lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_tree_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2659,6 +2679,7 @@ static void cgroup_cfts_prepare(void) * Instead, we use css_for_each_descendant_pre() and drop RCU read * lock before calling cgroup_addrm_files(). */ + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); } @@ -2679,6 +2700,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); return 0; } @@ -2702,7 +2724,9 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) prev = cgrp->dentry; mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) ret = cgroup_addrm_files(cgrp, cfts, is_add); @@ -2711,6 +2735,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) break; } mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); dput(prev); deactivate_super(sb); return ret; @@ -2856,7 +2881,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* * @pos could already have been removed. 
Once a cgroup is removed, @@ -2914,7 +2939,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -2955,7 +2980,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); do { last = pos; @@ -3003,7 +3028,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -3977,6 +4002,7 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) @@ -3994,6 +4020,7 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -4093,6 +4120,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } rcu_assign_pointer(cgrp->name, name); + mutex_lock(&cgroup_tree_mutex); + /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4102,7 +4131,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_name; + goto err_unlock_tree; } /* @@ -4176,6 +4205,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; @@ -4186,7 +4216,8 @@ err_free_id: deactivate_super(sb); err_unlock: mutex_unlock(&cgroup_mutex); -err_free_name: +err_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: kfree(cgrp); @@ -4195,6 +4226,7 @@ err_free_cgrp: err_destroy: cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); mutex_unlock(&dentry->d_inode->i_mutex); return err; } @@ -4217,6 +4249,7 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* @@ -4234,6 +4267,7 @@ static void css_killed_work_fn(struct work_struct *work) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); /* * Put the css refs from kill_css(). 
Each css holds an extra @@ -4321,6 +4355,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) int ssid; lockdep_assert_held(&d->d_inode->i_mutex); + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* @@ -4407,6 +4442,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ @@ -4422,9 +4458,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { int ret; + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = cgroup_destroy_locked(dentry->d_fsdata); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); return ret; } @@ -4454,6 +4492,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* init base cftset */ @@ -4482,6 +4521,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); } /** @@ -5021,7 +5061,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) -- cgit v1.3-14-g43fede From 4ac0601744eb86e982fbdadde35f1945f7ce5882 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:47 -0500 Subject: cgroup: release cgroup_mutex over file removals Now that cftypes and all tree modification operations are protected by cgroup_tree_mutex, we can drop cgroup_mutex while deleting files and directories. Drop cgroup_mutex over removals. This doesn't make any noticeable difference now but is to help kernfs conversion. In kernfs, removals are sync points which drain in-flight operations; as those operations would grab cgroup_mutex, trying to delete under cgroup_mutex would deadlock. This can be resolved by just holding the outer cgroup_tree_mutex which nests outside both kernfs active reference and cgroup_mutex. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cb20d12cb096..d28cf75f33c1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -976,7 +976,9 @@ static int rebind_subsystems(struct cgroupfs_root *root, * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem.
*/ + mutex_unlock(&cgroup_mutex); cgroup_clear_dir(cgrp, removed_mask); + mutex_lock(&cgroup_mutex); for_each_subsys(ss, i) { unsigned long bit = 1UL << i; @@ -2696,10 +2698,11 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) u64 update_before; int ret = 0; + mutex_unlock(&cgroup_mutex); + /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { - mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); return 0; } @@ -2723,18 +2726,15 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) dput(prev); prev = cgrp->dentry; - mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) ret = cgroup_addrm_files(cgrp, cfts, is_add); mutex_unlock(&inode->i_mutex); if (ret) break; } - mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); dput(prev); deactivate_super(sb); @@ -4387,10 +4387,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Initiate massacre of all css's. cgroup_destroy_css_killed() * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. + * percpu refs of all css's are confirmed to be killed. This + * involves removing the subsystem's files, drop cgroup_mutex. */ + mutex_unlock(&cgroup_mutex); for_each_css(css, ssid, cgrp) kill_css(css); + mutex_lock(&cgroup_mutex); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4421,9 +4424,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * puts the base ref but we aren't quite done with @cgrp yet, so * hold onto it. */ + mutex_unlock(&cgroup_mutex); cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); + mutex_lock(&cgroup_mutex); return 0; }; -- cgit v1.3-14-g43fede From 8e30e2b8ba0ee58aa0f442d0b4a3cac1a4f2efb5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: restructure locking and error handling in cgroup_mount() cgroup is scheduled to be converted to kernfs. After conversion, cgroup_mount() won't use the sget() machinery for finding out existing super_blocks but instead would do that directly. It'll search the existing cgroupfs_roots for a matching one and create a new one iff a match doesn't exist. To ease such conversion, this patch restructures locking and error handling of the function. cgroup_tree_mutex and cgroup_mutex are grabbed from the get-go and held until return. For now, due to the way vfs locks nest outside cgroup mutexes, the two cgroup mutexes are temporarily dropped across sget() and inode mutex locking, which looks quite ridiculous; however, these will be removed through kernfs conversion and structuring the code this way makes the conversion less painful. The error goto labels are consolidated to two. This looks unwieldy now but the next patch will factor out creation of new root into a separate function with accompanying error handling and it'll look a lot better. 
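To make the consolidated error handling concrete, here is a minimal free-standing sketch of the idiom the patch adopts (the example_* names and mutexes are hypothetical, not code from the patch): take the locks up front, funnel every failure to a single label, and make each cleanup step conditional on whether its resource was actually acquired.

static struct dentry *example_mount(void *data)
{
	struct super_block *sb = NULL;	/* NULL/ERR_PTR => nothing to put */
	int ret;

	mutex_lock(&example_outer_mutex);	/* assumed to exist */
	mutex_lock(&example_inner_mutex);

	ret = example_parse_options(data);	/* hypothetical helper */
	if (ret)
		goto out_unlock;

	sb = example_get_sb(data);		/* may return ERR_PTR() */
	if (IS_ERR(sb)) {
		ret = PTR_ERR(sb);
		goto out_unlock;
	}

	ret = 0;
out_unlock:
	mutex_unlock(&example_inner_mutex);
	mutex_unlock(&example_outer_mutex);

	/* tear down only what was actually set up */
	if (ret && !IS_ERR_OR_NULL(sb))
		example_put_sb(sb);

	return ret ? ERR_PTR(ret) : dget(sb->s_root);
}

The conditional teardown is what lets early failures (before the superblock exists) and late failures share one exit path, which is exactly the shape cgroup_mount() takes in the hunks below.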
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 73 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 40 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d28cf75f33c1..083b53d79d6f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1459,21 +1459,22 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + LIST_HEAD(tmp_links); + struct super_block *sb = NULL; + struct inode *inode = NULL; + struct cgroupfs_root *root = NULL; struct cgroup_sb_opts opts; - struct cgroupfs_root *root; - int ret = 0; - struct super_block *sb; struct cgroupfs_root *new_root; - struct list_head tmp_links; - struct inode *inode; const struct cred *cred; + int ret; - /* First find the desired set of subsystems */ + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); + + /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - mutex_unlock(&cgroup_mutex); if (ret) - goto out_err; + goto out_unlock; /* * Allocate a new cgroup root. We may not need it if we're @@ -1482,16 +1483,20 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto out_err; + goto out_unlock; } opts.new_root = new_root; /* Locate an existing or new sb for this hierarchy */ + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_free_root(opts.new_root); - goto out_err; + goto out_unlock; } root = sb->s_fs_info; @@ -1505,9 +1510,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(sb->s_root != NULL); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + ret = cgroup_get_rootdir(sb); if (ret) - goto drop_new_super; + goto out_unlock; inode = sb->s_root->d_inode; mutex_lock(&inode->i_mutex); @@ -1516,7 +1524,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); if (ret < 0) - goto unlock_drop; + goto out_unlock; root_cgrp->id = ret; /* Check for name clashes with existing mounts */ @@ -1524,7 +1532,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (strlen(root->name)) for_each_active_root(existing_root) if (!strcmp(existing_root->name, root->name)) - goto unlock_drop; + goto out_unlock; /* * We're accessing css_set_count without locking @@ -1535,12 +1543,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto unlock_drop; + goto out_unlock; /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ ret = cgroup_init_root_id(root, 2, 0); if (ret) - goto unlock_drop; + goto out_unlock; sb->s_root->d_fsdata = root_cgrp; root_cgrp->dentry = sb->s_root; @@ -1580,14 +1588,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, link_css_set(&tmp_links, cset, root_cgrp); write_unlock(&css_set_lock); - free_cgrp_cset_links(&tmp_links); - BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&inode->i_mutex); } else { /* * We re-used an existing hierarchy - the new root (if @@ 
-1599,32 +1601,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; - goto drop_new_super; + goto out_unlock; } else { pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } } - kfree(opts.release_agent); - kfree(opts.name); - return dget(sb->s_root); + ret = 0; + goto out_unlock; - rm_base_files: - free_cgrp_cset_links(&tmp_links); +rm_base_files: cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); revert_creds(cred); - unlock_drop: cgroup_exit_root_id(root); +out_unlock: mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&inode->i_mutex); - drop_new_super: - deactivate_locked_super(sb); - out_err: + if (inode) + mutex_unlock(&inode->i_mutex); + + if (ret && !IS_ERR_OR_NULL(sb)) + deactivate_locked_super(sb); + + free_cgrp_cset_links(&tmp_links); kfree(opts.release_agent); kfree(opts.name); - return ERR_PTR(ret); + + if (!ret) + return dget(sb->s_root); + else + return ERR_PTR(ret); } static void cgroup_kill_sb(struct super_block *sb) -- cgit v1.3-14-g43fede From d427dfeb120b92c0c5e2ca9d1ec6952de67ebad9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: factor out cgroup_setup_root() from cgroup_mount() Factor out new root initialization into cgroup_setup_root() from cgroup_mount(). This makes it easier to follow and will ease kernfs conversion. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 211 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 113 insertions(+), 98 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 083b53d79d6f..0a178cd1f836 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1455,17 +1455,126 @@ static int cgroup_get_rootdir(struct super_block *sb) return 0; } +static int cgroup_setup_root(struct cgroupfs_root *root) +{ + LIST_HEAD(tmp_links); + struct super_block *sb = root->sb; + struct cgroup *root_cgrp = &root->top_cgroup; + struct cgroupfs_root *existing_root; + struct css_set *cset; + struct inode *inode; + const struct cred *cred; + int i, ret; + + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); + BUG_ON(sb->s_root != NULL); + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + + ret = cgroup_get_rootdir(sb); + if (ret) { + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + return ret; + } + inode = sb->s_root->d_inode; + + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) + goto out_unlock; + root_cgrp->id = ret; + + /* check for name clashes with existing mounts */ + ret = -EBUSY; + if (strlen(root->name)) + for_each_active_root(existing_root) + if (!strcmp(existing_root->name, root->name)) + goto out_unlock; + + /* + * We're accessing css_set_count without locking css_set_lock here, + * but that's OK - it can only be increased by someone holding + * cgroup_lock, and that's us. 
The worst that can happen is that we + * have some link structures left over + */ + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + if (ret) + goto out_unlock; + + /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ + ret = cgroup_init_root_id(root, 2, 0); + if (ret) + goto out_unlock; + + sb->s_root->d_fsdata = root_cgrp; + root_cgrp->dentry = sb->s_root; + + /* + * We're inside get_sb() and will call lookup_one_len() to create + * the root files, which doesn't work if SELinux is in use. The + * following cred dancing somehow works around it. See 2ce9738ba + * ("cgroupfs: use init_cred when populating new cgroupfs mount") + * for more details. + */ + cred = override_creds(&init_cred); + + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (ret) + goto rm_base_files; + + ret = rebind_subsystems(root, root->subsys_mask, 0); + if (ret) + goto rm_base_files; + + revert_creds(cred); + + /* + * There must be no failure case after here, since rebinding takes + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ + list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; + + /* + * Link the top cgroup in this hierarchy into all the css_set + * objects. + */ + write_lock(&css_set_lock); + hash_for_each(css_set_table, i, cset, hlist) + link_css_set(&tmp_links, cset, root_cgrp); + write_unlock(&css_set_lock); + + BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(root->number_of_cgroups != 1); + + ret = 0; + goto out_unlock; + +rm_base_files: + cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); + revert_creds(cred); + cgroup_exit_root_id(root); +out_unlock: + mutex_unlock(&inode->i_mutex); + free_cgrp_cset_links(&tmp_links); + return ret; +} + static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - LIST_HEAD(tmp_links); struct super_block *sb = NULL; - struct inode *inode = NULL; struct cgroupfs_root *root = NULL; struct cgroup_sb_opts opts; struct cgroupfs_root *new_root; - const struct cred *cred; int ret; mutex_lock(&cgroup_tree_mutex); @@ -1502,94 +1611,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, root = sb->s_fs_info; BUG_ON(!root); if (root == opts.new_root) { - /* We used the new root structure, so this is a new hierarchy */ - struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; - int i; - struct css_set *cset; - - BUG_ON(sb->s_root != NULL); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - ret = cgroup_get_rootdir(sb); + ret = cgroup_setup_root(root); if (ret) goto out_unlock; - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); - if (ret < 0) - goto out_unlock; - root_cgrp->id = ret; - - /* Check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto out_unlock; - - /* - * We're accessing css_set_count without locking - * css_set_lock here, but that's OK - it can only be - * increased by someone holding cgroup_lock, and - * that's us. 
The worst that can happen is that we - have some link structures left over - */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); - if (ret) - goto out_unlock; - - /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ - ret = cgroup_init_root_id(root, 2, 0); - if (ret) - goto out_unlock; - - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; - - /* - * We're inside get_sb() and will call lookup_one_len() to - * create the root files, which doesn't work if SELinux is - * in use. The following cred dancing somehow works around - * it. See 2ce9738ba ("cgroupfs: use init_cred when - * populating new cgroupfs mount") for more details. - */ - cred = override_creds(&init_cred); - - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); - if (ret) - goto rm_base_files; - - ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret) - goto rm_base_files; - - revert_creds(cred); - - /* - * There must be no failure case after here, since rebinding - * takes care of subsystems' refcounts, which are explicitly - * dropped in the failure exit path. - */ - - list_add(&root->root_list, &cgroup_roots); - cgroup_root_count++; - - /* Link the top cgroup in this hierarchy into all - * the css_set objects */ - write_lock(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) - link_css_set(&tmp_links, cset, root_cgrp); - write_unlock(&css_set_lock); - - BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); } else { /* * We re-used an existing hierarchy - the new root (if @@ -1609,22 +1633,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } ret = 0; - goto out_unlock; - -rm_base_files: - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); - cgroup_exit_root_id(root); out_unlock: mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - if (inode) - mutex_unlock(&inode->i_mutex); if (ret && !IS_ERR_OR_NULL(sb)) deactivate_locked_super(sb); - free_cgrp_cset_links(&tmp_links); kfree(opts.release_agent); kfree(opts.name); -- cgit v1.3-14-g43fede From 8d7e6fb0a1db970ac3589f87af0f2a20ef46654b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: update cgroup name handling Straightforward updates to cgroup name handling in preparation of kernfs conversion. * cgroup_alloc_name() is updated to take const char * instead of dentry * for name source. * cgroup name formatting is separated out into cgroup_file_name(). While at it, buffer length protection is added.
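As a quick illustration of that buffer protection (a sketch built on the constants from the hunks below; the "memory"/"usage_in_bytes" strings are just example names): the buffer must hold the subsystem name, the '.' separator, the cftype name and the trailing NUL, which is where the "+ 2" in CGROUP_FILE_NAME_MAX comes from.

	char buf[CGROUP_FILE_NAME_MAX];

	/* prefixed case: "memory" + '.' + "usage_in_bytes" + '\0' */
	snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
		 "memory", "usage_in_bytes");

	/* unprefixed case (CFTYPE_NO_PREFIX or a NOPREFIX root) */
	strncpy(buf, "tasks", CGROUP_FILE_NAME_MAX);

Both calls are bounded by the shared constant, so a name that somehow exceeded its documented limit would be clamped to the buffer instead of overrunning it, unlike the old open-coded strcpy()/strcat() sequence.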
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0a178cd1f836..3f204429d108 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -67,6 +67,9 @@ */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ +#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ + MAX_CFTYPE_NAME + 2) + /* * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file * creation/removal and hierarchy changing operations including cgroup @@ -799,17 +802,29 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +static struct cgroup_name *cgroup_alloc_name(const char *name_str) { struct cgroup_name *name; - name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); + name = kmalloc(sizeof(*name) + strlen(name_str) + 1, GFP_KERNEL); if (!name) return NULL; - strcpy(name->name, dentry->d_name.name); + strcpy(name->name, name_str); return name; } +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) +{ + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", + cft->ss->name, cft->name); + else + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + return buf; +} + static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); @@ -2437,7 +2452,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_dentry); + name = cgroup_alloc_name(new_dentry->d_name.name); if (!name) return -ENOMEM; @@ -2613,14 +2628,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) struct cfent *cfe; int error; umode_t mode; - char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - - if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, cft->ss->name); - strcat(name, "."); - } - strcat(name, cft->name); + char name[CGROUP_FILE_NAME_MAX]; BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); @@ -2628,6 +2636,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) if (!cfe) return -ENOMEM; + cgroup_file_name(cgrp, cft, name); dentry = lookup_one_len(name, dir, strlen(name)); if (IS_ERR(dentry)) { error = PTR_ERR(dentry); @@ -4135,7 +4144,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(dentry); + name = cgroup_alloc_name(dentry->d_name.name); if (!name) { err = -ENOMEM; goto err_free_cgrp; -- cgit v1.3-14-g43fede From de00ffa56ea3132c6013fc8f07133b8a1014cf53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: make cgroup_subsys->base_cftypes use cgroup_add_cftypes() Currently, cgroup_subsys->base_cftypes registration is different from dynamic cftypes registration. Instead of going through cgroup_add_cftypes(), cgroup_init_subsys() invokes cgroup_init_cftsets() which makes use of cgroup_subsys->base_cftset which doesn't involve dynamic allocation. While avoiding dynamic allocation is somewhat nice, having two separate paths for cftypes registration is nasty, especially as we're planning to add more operations during cftypes registration.
This patch drops cgroup_init_cftsets() and cgroup_subsys->base_cftset and registers base_cftypes using cgroup_add_cftypes(). This is done as a separate step in cgroup_init() instead of a part of cgroup_init_subsys(). This is because cgroup_init_subsys() can be called very early during boot when kmalloc() isn't available yet. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 +-- kernel/cgroup.c | 29 ++++++++--------------------- 2 files changed, 9 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 1ba4fc08f776..c4350a82320b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -604,9 +604,8 @@ struct cgroup_subsys { /* list of cftype_sets */ struct list_head cftsets; - /* base cftypes, automatically [de]registered with subsys itself */ + /* base cftypes, automatically registered with subsys itself */ struct cftype *base_cftypes; - struct cftype_set base_cftset; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f204429d108..eb002c622cd6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4503,25 +4503,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) return ret; } -static void __init cgroup_init_cftsets(struct cgroup_subsys *ss) -{ - INIT_LIST_HEAD(&ss->cftsets); - - /* - * base_cftset is embedded in subsys itself, no need to worry about - * deregistration. - */ - if (ss->base_cftypes) { - struct cftype *cft; - - for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) - cft->ss = ss; - - ss->base_cftset.cfts = ss->base_cftypes; - list_add_tail(&ss->base_cftset.node, &ss->cftsets); - } -} - static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; @@ -4531,8 +4512,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - /* init base cftset */ - cgroup_init_cftsets(ss); + INIT_LIST_HEAD(&ss->cftsets); /* Create the top cgroup state for this subsystem */ ss->root = &cgroup_dummy_root; @@ -4621,6 +4601,13 @@ int __init cgroup_init(void) for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); + + /* + * cftype registration needs kmalloc and can't be done + * during early_init. Register base cftypes separately. + */ + if (ss->base_cftypes) + WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); } /* allocate id for the dummy hierarchy */ -- cgit v1.3-14-g43fede From 5f46990787e2721b4db190ddc8af6fdbe8f010d7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: update the meaning of cftype->max_write_len cftype->max_write_len is used to extend the maximum size of writes. It's interpreted in such a way that the actual maximum size is one less than the specified value. The default size is defined by CGROUP_LOCAL_BUFFER_SIZE. Its interpretation is quite confusing - its value is decremented by 1 and then compared for equality with max size, which means that the actual default size is CGROUP_LOCAL_BUFFER_SIZE - 2, which is 62 chars. There's no point in having a limit that low. Update its definition so that it means the actual string length sans termination and anything below PAGE_SIZE-1 is treated as PAGE_SIZE-1. .max_write_len for "release_agent" is updated to PATH_MAX-1 and cgroup_release_agent_write() is updated so that the redundant strlen() check is removed and it uses strlcpy() instead of strcpy(). 
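Spelling the arithmetic out (a sketch; CGROUP_LOCAL_BUFFER_SIZE is the 64-byte constant removed by the hunks below, and PAGE_SIZE is typically 4096):

	/* old semantics, default case */
	max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1;	/* 63 */
	if (nbytes >= max_bytes)	/* a 63-byte write already fails */
		return -E2BIG;		/* => at most 62 usable characters */

	/* new semantics */
	max_bytes = max(cft->max_write_len, PAGE_SIZE);
	if (nbytes > max_bytes)		/* writes up to max_bytes succeed */
		return -E2BIG;

The copy buffer is still allocated at nbytes + 1 bytes, so the terminating NUL always fits and max_write_len can honestly mean "longest accepted string".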
.max_write_len initializations in blk-throttle.c and cfq-iosched.c are no longer necessary and removed. The one in cpuset is kept unchanged as it's an approximated value to begin with. This will also make transition to kernfs smoother. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-throttle.c | 4 ---- block/cfq-iosched.c | 3 --- include/linux/cgroup.h | 5 +++-- kernel/cgroup.c | 18 ++++++++---------- 4 files changed, 11 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1474c3ab7e72..861c363e4129 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1425,28 +1425,24 @@ static struct cftype throtl_files[] = { .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, - .max_write_len = 256, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, - .max_write_len = 256, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, - .max_write_len = 256, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, - .max_write_len = 256, }, { .name = "throttle.io_service_bytes", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 744833b630c6..461187943392 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1838,7 +1838,6 @@ static struct cftype cfq_blkcg_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, - .max_write_len = 256, }, { .name = "weight", @@ -1853,7 +1852,6 @@ static struct cftype cfq_blkcg_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cfqg_print_weight_device, .write_string = cfqg_set_weight_device, - .max_write_len = 256, }, { .name = "weight", @@ -1866,7 +1864,6 @@ static struct cftype cfq_blkcg_files[] = { .name = "leaf_weight_device", .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, - .max_write_len = 256, }, { .name = "leaf_weight", diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c4350a82320b..63feaed83bc6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -400,8 +400,9 @@ struct cftype { umode_t mode; /* - * If non-zero, defines the maximum length of string that can - * be passed to write_string; defaults to 64 + * The maximum length of string, excluding trailing nul, that can + * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is + * assumed. 
*/ size_t max_write_len; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index eb002c622cd6..fde3633ef389 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2213,13 +2213,14 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, static int cgroup_release_agent_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); - if (strlen(buffer) >= PATH_MAX) - return -EINVAL; + struct cgroupfs_root *root = css->cgroup->root; + + BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; spin_lock(&release_agent_path_lock); - strcpy(css->cgroup->root->release_agent_path, buffer); + strlcpy(root->release_agent_path, buffer, + sizeof(root->release_agent_path)); spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); return 0; @@ -2245,20 +2246,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 - static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, size_t nbytes, loff_t *ppos) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; + size_t max_bytes = max(cft->max_write_len, PAGE_SIZE); char *buf; int ret; - if (nbytes >= max_bytes) + if (nbytes > max_bytes) return -E2BIG; buf = kmalloc(nbytes + 1, GFP_KERNEL); @@ -3919,7 +3917,7 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, + .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; -- cgit v1.3-14-g43fede From 2da440a26ce4743bd3e71ba964ba3f983d09bba5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:48 -0500 Subject: cgroup: introduce cgroup_init/exit_cftypes() Factor out cft->ss initialization into cgroup_init_cftypes() from cgroup_add_cftypes() and add cft->ss clearing to cgroup_rm_cftypes() through cgroup_exit_cftypes(). This doesn't make any meaningful difference now but the two new functions will be expanded during kernfs transition. 
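Both helpers walk the array with for (cft = cfts; cft->name[0] != '\0'; cft++), i.e. up to the empty sentinel entry that terminates every cftype array. An illustrative array in that shape (the name and handler are made up):

static struct cftype example_files[] = {
	{
		.name = "example.state",
		.seq_show = example_state_show,	/* hypothetical handler */
	},
	{ }	/* terminate - name[0] == '\0' stops the loops */
};

After this patch, cgroup_add_cftypes() stamps cft->ss into each entry through cgroup_init_cftypes(), and cgroup_rm_cftypes() clears it again through cgroup_exit_cftypes(), leaving a removed array in its pristine, re-registrable state.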
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fde3633ef389..42e588ef62d1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2770,6 +2770,22 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) return ret; } +static void cgroup_exit_cftypes(struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = NULL; +} + +static void cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; + + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = ss; +} + /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem @@ -2787,15 +2803,13 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; - struct cftype *cft; int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; - for (cft = cfts; cft->name[0] != '\0'; cft++) - cft->ss = ss; + cgroup_init_cftypes(ss, cfts); cgroup_cfts_prepare(); set->cfts = cfts; @@ -2820,6 +2834,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); */ int cgroup_rm_cftypes(struct cftype *cfts) { + struct cftype *found = NULL; struct cftype_set *set; if (!cfts || !cfts[0].ss) @@ -2831,13 +2846,14 @@ int cgroup_rm_cftypes(struct cftype *cfts) if (set->cfts == cfts) { list_del(&set->node); kfree(set); - cgroup_cfts_commit(cfts, false); - return 0; + found = cfts; + break; } } - cgroup_cfts_commit(NULL, false); - return -ENOENT; + cgroup_cfts_commit(found, false); + cgroup_exit_cftypes(cfts); + return found ? 0 : -ENOENT; } /** @@ -4596,6 +4612,8 @@ int __init cgroup_init(void) if (err) return err; + cgroup_init_cftypes(NULL, cgroup_base_files); + for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); -- cgit v1.3-14-g43fede From b1664924062393bb048203bd4622e0b1c9e1d328 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: introduce cgroup_ino() mm/memory-failure.c::hwpoison_filter_task() has been reaching into cgroup to extract the associated ino to be used as a filtering criterion. This is an implementation detail which shouldn't be depended upon from outside cgroup proper and is about to change with the scheduled kernfs conversion. This patch introduces a proper interface to determine the associated ino, cgroup_ino(), and updates hwpoison_filter_task() to use it instead of reaching directly into cgroup. 
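On the consumer side this becomes a two-liner; a sketch mirroring the hwpoison hunk below, where hwpoison_filter_memcg is the existing filter value:

	ino_t ino = cgroup_ino(css->cgroup);	/* 0 == unmounted root */

	css_put(css);
	if (!ino || ino != hwpoison_filter_memcg)
		return -EINVAL;

Because 0 is reserved to mean "unmounted root", the companion hunk makes cgroup_new_inode() loop until get_next_ino() hands back a non-zero number, so a live cgroup can never be confused with the unmounted case.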
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Andi Kleen Cc: Wu Fengguang --- include/linux/cgroup.h | 9 +++++++++ kernel/cgroup.c | 5 ++++- mm/memory-failure.c | 8 ++------ 3 files changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 63feaed83bc6..06f577764414 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -507,6 +507,15 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) return rcu_dereference(cgrp->name)->name; } +/* returns ino associated with a cgroup, 0 indicates unmounted root */ +static inline ino_t cgroup_ino(struct cgroup *cgrp) +{ + if (cgrp->dentry) + return cgrp->dentry->d_inode->i_ino; + else + return 0; +} + static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { struct cgroup_open_file *of = seq->private; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 42e588ef62d1..11f7a05e791e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -792,7 +792,10 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { - inode->i_ino = get_next_ino(); + do { + /* ino 0 is reserved for dummy_root */ + inode->i_ino = get_next_ino(); + } while (!inode->i_ino); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4f08a2d61487..9b5933c66c16 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p) return -EINVAL; css = mem_cgroup_css(mem); - /* root_mem_cgroup has NULL dentries */ - if (!css->cgroup->dentry) - return -EINVAL; - - ino = css->cgroup->dentry->d_inode->i_ino; + ino = cgroup_ino(css->cgroup); css_put(css); - if (ino != hwpoison_filter_memcg) + if (!ino || ino != hwpoison_filter_memcg) return -EINVAL; return 0; -- cgit v1.3-14-g43fede From 59f5296b51b86718dd6eecf0a268b2f1a1ec0a2d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: misc preps for kernfs conversion * Un-inline seq_css(). After kernfs conversion, the function will need to dereference internal data structures. * Add cgroup_get/put_root() and replace direct super_block->s_active manipulations with them. These will be converted to kernfs_root refcnting. * Add cgroup_get/put() and replace dget/put() on cgrp->dentry with them. These will be converted to kernfs refcnting. * Update current_css_set_cg_links_read() to use cgroup_name() instead of reaching into the dentry name. The end result is the same. These changes don't make any functional difference but will make the transition to kernfs easier. v2: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex").
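Side by side, the mechanical substitutions this makes look as follows (a sketch; the helper bodies are in the hunks below and are, for now, thin wrappers over the old calls):

	/* superblock active reference */
	atomic_inc(&root->sb->s_active);	/* old */
	cgroup_get_root(root);			/* new */
	deactivate_super(root->sb);		/* old */
	cgroup_put_root(root);			/* new */

	/* cgroup directory dentry reference */
	dget(cgrp->dentry);			/* old */
	cgroup_get(cgrp);			/* new */
	dput(cgrp->dentry);			/* old */
	cgroup_put(cgrp);			/* new */

Funneling every call site through these names is what lets the kernfs conversion later re-point them at kernfs_root and kernfs_node refcounts without touching the callers again.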
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 ++--- kernel/cgroup.c | 85 +++++++++++++++++++++++++++++++------------------- 2 files changed, 55 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 06f577764414..523277871913 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -516,18 +516,14 @@ static inline ino_t cgroup_ino(struct cgroup *cgrp) return 0; } -static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) -{ - struct cgroup_open_file *of = seq->private; - return of->cfe->css; -} - static inline struct cftype *seq_cft(struct seq_file *seq) { struct cgroup_open_file *of = seq->private; return of->cfe->type; } +struct cgroup_subsys_state *seq_css(struct seq_file *seq); + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11f7a05e791e..9e9e8fd632d8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -169,6 +169,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; +static void cgroup_put(struct cgroup *cgrp); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -204,6 +205,13 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } +struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct cgroup_open_file *of = seq->private; + return of->cfe->css; +} +EXPORT_SYMBOL_GPL(seq_css); + /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested @@ -682,6 +690,16 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +static void cgroup_get_root(struct cgroupfs_root *root) +{ + atomic_inc(&root->sb->s_active); +} + +static void cgroup_put_root(struct cgroupfs_root *root) +{ + deactivate_super(root->sb); +} + /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex held. @@ -837,18 +855,14 @@ static void cgroup_free_fn(struct work_struct *work) mutex_unlock(&cgroup_mutex); /* - * We get a ref to the parent's dentry, and put the ref when - * this cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. + * We get a ref to the parent, and put the ref when this cgroup is + * being freed, so it's guaranteed that the parent won't be + * destroyed before its children. */ - dput(cgrp->parent->dentry); + cgroup_put(cgrp->parent); - /* - * Drop the active superblock reference that we took when we - * created the cgroup. This will free cgrp->root, if we are - * holding the last reference to @sb. - */ - deactivate_super(cgrp->root->sb); + /* put the root reference that we took when we created the cgroup */ + cgroup_put_root(cgrp->root); cgroup_pidlist_destroy_all(cgrp); @@ -866,6 +880,11 @@ static void cgroup_free_rcu(struct rcu_head *head) queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } +static void cgroup_get(struct cgroup *cgrp) +{ + dget(cgrp->dentry); +} + static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? 
if so, kfree() associated cgroup */ @@ -899,6 +918,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static void cgroup_put(struct cgroup *cgrp) +{ + dput(cgrp->dentry); +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -2724,7 +2748,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; - struct dentry *prev = NULL; + struct cgroup *prev = NULL; struct inode *inode; struct cgroup_subsys_state *css; u64 update_before; @@ -2754,9 +2778,10 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) continue; inode = cgrp->dentry->d_inode; - dget(cgrp->dentry); - dput(prev); - prev = cgrp->dentry; + cgroup_get(cgrp); + if (prev) + cgroup_put(prev); + prev = cgrp; mutex_unlock(&cgroup_tree_mutex); mutex_lock(&inode->i_mutex); @@ -2768,8 +2793,8 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) break; } mutex_unlock(&cgroup_tree_mutex); - dput(prev); - deactivate_super(sb); + cgroup_put(prev); + cgroup_put_root(ss->root); return ret; } @@ -3863,11 +3888,9 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, */ static void cgroup_dput(struct cgroup *cgrp) { - struct super_block *sb = cgrp->root->sb; - - atomic_inc(&sb->s_active); - dput(cgrp->dentry); - deactivate_super(sb); + cgroup_get_root(cgrp->root); + cgroup_put(cgrp); + cgroup_put_root(cgrp->root); } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, @@ -4118,7 +4141,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_free; - dget(cgrp->dentry); + cgroup_get(cgrp); css_get(css->parent); if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && @@ -4197,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * can be done outside cgroup_mutex, since the sb can't * disappear while someone has an open control file on the * fs */ - atomic_inc(&sb->s_active); + cgroup_get_root(root); init_cgroup_housekeeping(cgrp); @@ -4231,7 +4254,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, root->number_of_cgroups++; /* hold a ref to the parent's dentry */ - dget(parent->dentry); + cgroup_get(parent); /* * @cgrp is now fully operational. 
If something fails after this @@ -4261,7 +4284,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); /* Release the reference count that we took on the superblock */ - deactivate_super(sb); + cgroup_put_root(root); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -4493,7 +4516,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; - struct dentry *d = cgrp->dentry; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4501,7 +4523,7 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - dput(d); + cgroup_put(cgrp); set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); @@ -5161,12 +5183,11 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name; + const char *name = "?"; + + if (c != cgroup_dummy_top) + name = cgroup_name(c); - if (c->dentry) - name = c->dentry->d_name.name; - else - name = "?"; seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name); } -- cgit v1.3-14-g43fede From f2e85d574e881ff3c597518c1ab48c86f9109880 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: relocate functions in preparation of kernfs conversion Relocate cgroup_init/exit_root_id(), cgroup_free_root(), cgroup_kill_sb() and cgroup_file_name() in preparation of kernfs conversion. These are pure relocations to make kernfs conversion easier to follow. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 232 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 117 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9e9e8fd632d8..d8efca44de5f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -170,6 +170,8 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); +static int rebind_subsystems(struct cgroupfs_root *root, + unsigned long added_mask, unsigned removed_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -690,6 +692,42 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) +{ + int id; + + lockdep_assert_held(&cgroup_mutex); + + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, + GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; + return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ + lockdep_assert_held(&cgroup_mutex); + + if (root->hierarchy_id) { + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); + root->hierarchy_id = 0; + } +} + +static void cgroup_free_root(struct cgroupfs_root *root) +{ + if (root) { + /* hierarhcy ID shoulid already have been released */ + WARN_ON_ONCE(root->hierarchy_id); + + idr_destroy(&root->cgroup_idr); + kfree(root); + } +} + static void cgroup_get_root(struct cgroupfs_root *root) { atomic_inc(&root->sb->s_active); @@ -700,6 +738,59 @@ static void cgroup_put_root(struct cgroupfs_root *root) deactivate_super(root->sb); } +static void cgroup_kill_sb(struct super_block *sb) +{ + struct cgroupfs_root *root = sb->s_fs_info; + struct cgroup *cgrp = &root->top_cgroup; + struct cgrp_cset_link *link, *tmp_link; + int ret; + + BUG_ON(!root); + + BUG_ON(root->number_of_cgroups != 1); + BUG_ON(!list_empty(&cgrp->children)); + + mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + /* Rebind all subsystems back to the default hierarchy */ + if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { + ret = rebind_subsystems(root, 0, root->subsys_mask); + /* Shouldn't be able to fail ... */ + BUG_ON(ret); + } + + /* + * Release all the links from cset_links to this hierarchy's + * root cgroup + */ + write_lock(&css_set_lock); + + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + kfree(link); + } + write_unlock(&css_set_lock); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + cgroup_root_count--; + } + + cgroup_exit_root_id(root); + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + + simple_xattrs_free(&cgrp->xattrs); + + kill_litter_super(sb); + cgroup_free_root(root); +} + /* * Return the cgroup for "task" from the given hierarchy. Must be * called with cgroup_mutex held. 
@@ -846,6 +937,32 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, return buf; } +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write hander + */ +static umode_t cgroup_file_mode(const struct cftype *cft) +{ + umode_t mode = 0; + + if (cft->mode) + return cft->mode; + + if (cft->read_u64 || cft->read_s64 || cft->seq_show) + mode |= S_IRUGO; + + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) + mode |= S_IWUSR; + + return mode; +} + static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); @@ -1358,31 +1475,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) idr_init(&root->cgroup_idr); } -static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) -{ - int id; - - lockdep_assert_held(&cgroup_mutex); - - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, - GFP_KERNEL); - if (id < 0) - return id; - - root->hierarchy_id = id; - return 0; -} - -static void cgroup_exit_root_id(struct cgroupfs_root *root) -{ - lockdep_assert_held(&cgroup_mutex); - - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } -} - static int cgroup_test_super(struct super_block *sb, void *data) { struct cgroup_sb_opts *opts = data; @@ -1435,17 +1527,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static void cgroup_free_root(struct cgroupfs_root *root) -{ - if (root) { - /* hierarhcy ID shoulid already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - - idr_destroy(&root->cgroup_idr); - kfree(root); - } -} - static int cgroup_set_super(struct super_block *sb, void *data) { int ret; @@ -1691,59 +1772,6 @@ out_unlock: return ERR_PTR(ret); } -static void cgroup_kill_sb(struct super_block *sb) -{ - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgrp_cset_link *link, *tmp_link; - int ret; - - BUG_ON(!root); - - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); - - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - - /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... 
*/ - BUG_ON(ret); - } - - /* - * Release all the links from cset_links to this hierarchy's - * root cgroup - */ - write_lock(&css_set_lock); - - list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - kfree(link); - } - write_unlock(&css_set_lock); - - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } - - cgroup_exit_root_id(root); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - - simple_xattrs_free(&cgrp->xattrs); - - kill_litter_super(sb); - cgroup_free_root(root); -} - static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, @@ -2619,32 +2647,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, return 0; } -/** - * cgroup_file_mode - deduce file mode of a control file - * @cft: the control file in question - * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander - */ -static umode_t cgroup_file_mode(const struct cftype *cft) -{ - umode_t mode = 0; - - if (cft->mode) - return cft->mode; - - if (cft->read_u64 || cft->read_s64 || cft->seq_show) - mode |= S_IRUGO; - - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) - mode |= S_IWUSR; - - return mode; -} - static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { struct dentry *dir = cgrp->dentry; -- cgit v1.3-14-g43fede From 2bd59d48ebfb3df41ee56938946ca0dd30887312 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Feb 2014 11:52:49 -0500 Subject: cgroup: convert to kernfs cgroup filesystem code was derived from the original sysfs implementation which was heavily intertwined with vfs objects and locking with the goal of re-using the existing vfs infrastructure. That experiment turned out rather disastrous and sysfs switched, a long time ago, to a distributed filesystem model where a separate representation is maintained which is queried by vfs. Unfortunately, cgroup stuck with the failed experiment all these years and accumulated even more problems over time. Locking and object lifetime management being entangled with vfs is probably the most egregious. vfs was never designed to be misused like this, and cgroup ends up jumping through convoluted hoops to make things work. Even then, operations across multiple cgroups can't be done safely as it'll deadlock with rename locking. Recently, kernfs was separated out from sysfs so that it can be used by users other than sysfs. This patch converts cgroup to use kernfs, which will bring the following benefits. * Separation from vfs internals. Locking and object lifetime management is contained in cgroup proper, making things a lot simpler. This removes a significant amount of locking convolution, hairy object lifetime rules and the restriction on multi-cgroup operations. * Can drop a lot of code implementing the filesystem interface, as most of it is provided by kernfs. * Proper "severing" semantics, which allows controllers to not worry about lingering file accesses after offline. While the preceding patches did as much as possible to make the transition less painful, a large part of the conversion has to be one discrete step, making this patch rather large. The rest of the commit message lists notable changes in different areas.
Overall ------- * vfs constructs replaced with kernfs ones. cgroup->dentry w/ ->kn, cgroupfs_root->sb w/ ->kf_root. * All dentry accessors are removed. Helpers to map from kernfs constructs are added. * All vfs plumbing around dentry, inode and bdi removed. * cgroup_mount() now directly looks for a matching root and then proceeds to create a new one if not found. Synchronization and object lifetime ----------------------------------- * vfs inode locking removed. Among other things, this removes the need for the convolution in cgroup_cfts_commit(). Future patches will further simplify it. * vfs refcnting replaced with cgroup internal ones. cgroup->refcnt, cgroupfs_root->refcnt added. cgroup_put_root() now directly puts root->refcnt and when it reaches zero proceeds to destroy it thus merging cgroup_put_root() and the former cgroup_kill_sb(). Similarly, cgroup_put() now directly schedules cgroup_free_rcu() when refcnt reaches zero. * Unlike before, kernfs objects don't hold onto cgroup objects. When cgroup destroys a kernfs node, all existing operations are drained and the association is broken immediately. The same for cgroupfs_roots and mounts. * All operations which come through kernfs guarantee that the associated cgroup is and stays valid for the duration of operation; however, there are two paths which need to find out the associated cgroup from dentry without going through kernfs - css_tryget_from_dir() and cgroupstats_build(). For these two, kernfs_node->priv is RCU managed so that they can dereference it under RCU read lock. File and directory handling --------------------------- * File and directory operations converted to kernfs_ops and kernfs_syscall_ops. * xattrs are implicitly supported by kernfs. No need to worry about them from cgroup. This means that the "xattr" mount option is no longer necessary. A future patch will add a deprecation warning message when sane_behavior is in use. * When cftype->max_write_len > PAGE_SIZE, it's necessary to make a private copy of one of the kernfs_ops to set its atomic_write_len. cftype->kf_ops is added and cgroup_init/exit_cftypes() are updated to handle it. * cftype->lockdep_key added so that kernfs lockdep annotation can be per cftype. * Individual file entries and open states are now managed by kernfs. No need to worry about them from cgroup. cfent, cgroup_open_file and their friends are removed. * kernfs_nodes are created deactivated and kernfs_activate() invocations added to places where creation of new nodes is committed. * cgroup_rmdir() uses kernfs_[un]break_active_protection() for self-removal. v2: - Li pointed out in an earlier patch that specifying "name=" during mount without subsystem specification should succeed if there's an existing hierarchy with a matching name although it should fail with -EINVAL if a new hierarchy should be created. Prior to the conversion, this used to be handled by deferring failure from a NULL return from cgroup_root_from_opts(), which was necessary because the root was being created before checking for existing ones. Note that cgroup_root_from_opts() returned an ERR_PTR() value for error conditions which require immediate mount failure. As we now have separate search and creation steps, deferring failure from cgroup_root_from_opts() is no longer necessary. cgroup_root_from_opts() is updated to always return an ERR_PTR() value on failure. - The logic to match existing roots is updated so that a mount attempt with a matching name but a different subsys_mask is rejected.
This was handled by a separate matching loop under the comment "Check for name clashes with existing mounts" but got lost during conversion. Merge the check into the main search loop. - Add __rcu __force casting in RCU_INIT_POINTER() in cgroup_destroy_locked() to avoid the sparse address space warning reported by kbuild test bot. Maybe we want an explicit interface to use kn->priv as an RCU-protected pointer? v3: Make CONFIG_CGROUPS select CONFIG_KERNFS. v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: kbuild test robot --- include/linux/cgroup.h | 52 +-- init/Kconfig | 1 + kernel/cgroup.c | 1115 ++++++++++++++++-------------------------------- 3 files changed, 383 insertions(+), 785 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 523277871913..0e45a932b823 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -18,10 +18,10 @@ #include #include #include -#include #include #include #include +#include #ifdef CONFIG_CGROUPS @@ -159,16 +159,17 @@ struct cgroup { /* the number of attached css's */ int nr_css; + atomic_t refcnt; + /* * We link our 'sibling' struct into our parent's 'children'. * Our children link their 'sibling' into our 'children'. */ struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct list_head files; /* my files */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry, RCU protected */ + struct kernfs_node *kn; /* cgroup kernfs entry */ /* * Monotonically increasing unique serial number which defines a @@ -222,9 +223,6 @@ struct cgroup { /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; - - /* directory xattrs */ - struct simple_xattrs xattrs; }; #define MAX_CGROUP_ROOT_NAMELEN 64 @@ -291,15 +289,17 @@ enum { /* * A cgroupfs_root represents the root of a cgroup hierarchy, and may be - * associated with a superblock to form an active hierarchy. This is + * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ struct cgroupfs_root { - struct super_block *sb; + struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ unsigned long subsys_mask; + atomic_t refcnt; + /* Unique id for this hierarchy. */ int hierarchy_id; @@ -415,6 +415,9 @@ struct cftype { */ struct cgroup_subsys *ss; + /* kernfs_ops to use, initialized automatically during registration */ + struct kernfs_ops *kf_ops; + /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() @@ -460,6 +463,10 @@ struct cftype { * kick type for multiplexing. */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lockdep_key; +#endif }; /* @@ -472,26 +479,6 @@ struct cftype_set { struct cftype *cfts; }; -/* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't - * access directly.
- */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; - -/* seq_file->private points to the following, only ->priv is public */ -struct cgroup_open_file { - struct cfent *cfe; - void *priv; -}; - /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. @@ -510,16 +497,17 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) /* returns ino associated with a cgroup, 0 indicates unmounted root */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { - if (cgrp->dentry) - return cgrp->dentry->d_inode->i_ino; + if (cgrp->kn) + return cgrp->kn->ino; else return 0; } static inline struct cftype *seq_cft(struct seq_file *seq) { - struct cgroup_open_file *of = seq->private; - return of->cfe->type; + struct kernfs_open_file *of = seq->private; + + return of->kn->priv; } struct cgroup_subsys_state *seq_css(struct seq_file *seq); diff --git a/init/Kconfig b/init/Kconfig index 009a797dd242..3f74784560a5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -854,6 +854,7 @@ config NUMA_BALANCING menuconfig CGROUPS boolean "Control Group support" + select KERNFS help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d8efca44de5f..cda614da40cf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -40,9 +40,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -50,7 +48,6 @@ #include #include #include -#include #include #include #include /* TODO: replace with more sophisticated array */ @@ -176,7 +173,6 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static int cgroup_file_release(struct inode *inode, struct file *file); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** @@ -209,8 +205,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) struct cgroup_subsys_state *seq_css(struct seq_file *seq) { - struct cgroup_open_file *of = seq->private; - return of->cfe->css; + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = seq_cft(seq); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. 
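/*
 * A quick sketch of the kn->priv convention the conversion relies on,
 * as used by seq_cft() and seq_css() above: a file's kernfs_node
 * carries its cftype in ->priv, and the parent directory's kernfs_node
 * carries the owning cgroup.  The helper names here are illustrative,
 * not part of the patch.
 */
static inline struct cftype *kn_to_cft(struct kernfs_node *kn)
{
	return kn->priv;		/* file node: priv is the cftype */
}

static inline struct cgroup *kn_to_cgroup(struct kernfs_node *kn)
{
	return kn->parent->priv;	/* directory node: priv is the cgroup */
}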
+ */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->dummy_css; } EXPORT_SYMBOL_GPL(seq_css); @@ -276,21 +286,6 @@ static int notify_on_release(const struct cgroup *cgrp) #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cfent *__d_cfe(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return __d_cfe(dentry)->type; -} - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -692,6 +687,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } +static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +{ + struct cgroup *top_cgrp = kf_root->kn->priv; + + return top_cgrp->root; +} + static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) { int id; @@ -730,30 +732,37 @@ static void cgroup_free_root(struct cgroupfs_root *root) static void cgroup_get_root(struct cgroupfs_root *root) { - atomic_inc(&root->sb->s_active); + /* + * The caller must ensure that @root is alive, which can be + * achieved by holding a ref on one of the member cgroups or + * following a registered reference to @root while holding + * cgroup_tree_mutex. + */ + WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0); + atomic_inc(&root->refcnt); } static void cgroup_put_root(struct cgroupfs_root *root) { - deactivate_super(root->sb); -} - -static void cgroup_kill_sb(struct super_block *sb) -{ - struct cgroupfs_root *root = sb->s_fs_info; struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; int ret; - BUG_ON(!root); + /* + * @root's refcnt reaching zero and its deregistration should be + * atomic w.r.t. cgroup_tree_mutex. This ensures that + * cgroup_get_root() is safe to invoke if @root is registered. + */ + mutex_lock(&cgroup_tree_mutex); + if (!atomic_dec_and_test(&root->refcnt)) { + mutex_unlock(&cgroup_tree_mutex); + return; + } + mutex_lock(&cgroup_mutex); BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - /* Rebind all subsystems back to the default hierarchy */ if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { ret = rebind_subsystems(root, 0, root->subsys_mask); @@ -783,11 +792,8 @@ static void cgroup_kill_sb(struct super_block *sb) mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - - simple_xattrs_free(&cgrp->xattrs); - kill_litter_super(sb); + kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -878,42 +884,10 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. 
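/*
 * The cgroup_get_root()/cgroup_put_root() pair above makes "refcnt
 * reached zero" and "root deregistered" atomic with respect to
 * cgroup_tree_mutex, so a get on any root that is still findable under
 * the mutex is safe.  A generic sketch of that pattern, with
 * hypothetical names (reg_mutex protects the registry):
 */
static DEFINE_MUTEX(reg_mutex);

struct thing {
	atomic_t refcnt;
	struct list_head node;	/* on the registry, under reg_mutex */
};

static void thing_get(struct thing *t)
{
	/* legal only while @t is reachable from the registry */
	atomic_inc(&t->refcnt);
}

static void thing_put(struct thing *t)
{
	/*
	 * Take the registry lock before the final decrement: a
	 * concurrent lookup under reg_mutex either finds @t with
	 * refcnt > 0, where thing_get() is safe, or not at all.
	 */
	mutex_lock(&reg_mutex);
	if (!atomic_dec_and_test(&t->refcnt)) {
		mutex_unlock(&reg_mutex);
		return;
	}
	list_del(&t->node);	/* deregister; no new lookups can succeed */
	mutex_unlock(&reg_mutex);
	kfree(t);
}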
- */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); -static const struct inode_operations cgroup_dir_inode_operations; +static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct backing_dev_info cgroup_backing_dev_info = { - .name = "cgroup", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) -{ - struct inode *inode = new_inode(sb); - - if (inode) { - do { - /* ino 0 is reserved for dummy_root */ - inode->i_ino = get_next_ino(); - } while (!inode->i_ino); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; -} - static struct cgroup_name *cgroup_alloc_name(const char *name_str) { struct cgroup_name *name; @@ -983,8 +957,6 @@ static void cgroup_free_fn(struct work_struct *work) cgroup_pidlist_destroy_all(cgrp); - simple_xattrs_free(&cgrp->xattrs); - kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -999,81 +971,38 @@ static void cgroup_free_rcu(struct rcu_head *head) static void cgroup_get(struct cgroup *cgrp) { - dget(cgrp->dentry); -} - -static void cgroup_diput(struct dentry *dentry, struct inode *inode) -{ - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - - BUG_ON(!(cgroup_is_dead(cgrp))); - - /* - * XXX: cgrp->id is only used to look up css's. As cgroup - * and css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. - */ - mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); - } else { - struct cfent *cfe = __d_cfe(dentry); - struct cgroup *cgrp = dentry->d_parent->d_fsdata; - - WARN_ONCE(!list_empty(&cfe->node) && - cgrp != &cgrp->root->top_cgroup, - "cfe still linked for %s\n", cfe->type->name); - simple_xattrs_free(&cfe->xattrs); - kfree(cfe); - } - iput(inode); + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); + atomic_inc(&cgrp->refcnt); } static void cgroup_put(struct cgroup *cgrp) { - dput(cgrp->dentry); -} + if (!atomic_dec_and_test(&cgrp->refcnt)) + return; + if (WARN_ON_ONCE(!cgroup_is_dead(cgrp))) + return; -static void remove_dir(struct dentry *d) -{ - struct dentry *parent = dget(d->d_parent); + /* + * XXX: cgrp->id is only used to look up css's. As cgroup and + * css's lifetimes will be decoupled, it should be made + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. 
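/*
 * cgroup_put() above pairs the final reference drop with an RCU grace
 * period: the id is unpublished first, then the actual kfree() is
 * deferred through call_rcu() so lockless readers that found the
 * cgroup before the unpublish can finish.  The shape of the pattern,
 * with a hypothetical struct:
 */
struct obj {
	atomic_t refcnt;
	struct rcu_head rcu_head;
};

static void obj_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct obj, rcu_head));
}

static void obj_put(struct obj *o)
{
	if (!atomic_dec_and_test(&o->refcnt))
		return;
	/* unpublish @o from any index readers use, then let RCU reap it */
	call_rcu(&o->rcu_head, obj_free_rcu);
}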
+ */ + mutex_lock(&cgroup_mutex); + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); + cgrp->id = -1; - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); } static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) { - struct cfent *cfe; + char name[CGROUP_FILE_NAME_MAX]; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); - - /* - * If we're doing cleanup due to failure of cgroup_create(), - * the corresponding @cfe may not exist. - */ - list_for_each_entry(cfe, &cgrp->files, node) { - struct dentry *d = cfe->dentry; - - if (cft && cfe->type != cft) - continue; - - dget(d); - d_delete(d); - simple_unlink(cgrp->dentry->d_inode, d); - list_del_init(&cfe->node); - dput(d); - - break; - } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** @@ -1096,22 +1025,6 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) -{ - struct dentry *parent; - - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - remove_dir(dentry); -} - static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { @@ -1179,13 +1092,15 @@ static int rebind_subsystems(struct cgroupfs_root *root, * now matches the bound subsystems. */ root->flags |= CGRP_ROOT_SUBSYS_BOUND; + kernfs_activate(cgrp->kn); return 0; } -static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; @@ -1219,9 +1134,6 @@ struct cgroup_sb_opts { char *name; /* User explicitly requested empty subsystem */ bool none; - - struct cgroupfs_root *new_root; - }; /* @@ -1380,11 +1292,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static int cgroup_remount(struct super_block *sb, int *flags, char *data) +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1393,7 +1304,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1439,34 +1349,26 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return ret; } -static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; - static void init_cgroup_housekeeping(struct cgroup *cgrp) { + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->files); 
INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; + atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; @@ -1475,32 +1377,12 @@ static void init_cgroup_root(struct cgroupfs_root *root) idr_init(&root->cgroup_idr); } -static int cgroup_test_super(struct super_block *sb, void *data) -{ - struct cgroup_sb_opts *opts = data; - struct cgroupfs_root *root = sb->s_fs_info; - - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_mask || opts->none) - && (opts->subsys_mask != root->subsys_mask)) - return 0; - - return 1; -} - static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) { struct cgroupfs_root *root; if (!opts->subsys_mask && !opts->none) - return NULL; + return ERR_PTR(-EINVAL); root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) @@ -1527,99 +1409,21 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static int cgroup_set_super(struct super_block *sb, void *data) -{ - int ret; - struct cgroup_sb_opts *opts = data; - - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; - - BUG_ON(!opts->subsys_mask && !opts->none); - - ret = set_anon_super(sb, NULL); - if (ret) - return ret; - - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; - - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; - - return 0; -} - -static int cgroup_get_rootdir(struct super_block *sb) -{ - static const struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - .d_delete = always_delete_dentry, - }; - - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - - if (!inode) - return -ENOMEM; - - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." 
entry) */ - inc_nlink(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - /* for everything else we want ->d_op set */ - sb->s_d_op = &cgroup_dops; - return 0; -} - static int cgroup_setup_root(struct cgroupfs_root *root) { LIST_HEAD(tmp_links); - struct super_block *sb = root->sb; struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; struct css_set *cset; - struct inode *inode; - const struct cred *cred; int i, ret; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - BUG_ON(sb->s_root != NULL); - - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - - ret = cgroup_get_rootdir(sb); - if (ret) { - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - return ret; - } - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); if (ret < 0) - goto out_unlock; + goto out; root_cgrp->id = ret; - /* check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto out_unlock; - /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding @@ -1628,34 +1432,29 @@ static int cgroup_setup_root(struct cgroupfs_root *root) */ ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); if (ret) - goto out_unlock; + goto out; /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ ret = cgroup_init_root_id(root, 2, 0); if (ret) - goto out_unlock; - - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; + goto out; - /* - * We're inside get_sb() and will call lookup_one_len() to create - * the root files, which doesn't work if SELinux is in use. The - * following cred dancing somehow works around it. See 2ce9738ba - * ("cgroupfs: use init_cred when populating new cgroupfs mount") - * for more details. 
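/*
 * The rewritten cgroup_setup_root() below leans on kernfs's two-phase
 * visibility instead of the cred dancing: the root is created
 * deactivated, populated while nothing can enter it, and only then
 * activated.  A condensed sketch of the sequence from this patch
 * (error handling elided):
 */
root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
				   KERNFS_ROOT_CREATE_DEACTIVATED,
				   root_cgrp);
root_cgrp->kn = root->kf_root->kn;

/* populate while deactivated: nothing is user-visible yet */
cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
rebind_subsystems(root, root->subsys_mask, 0);

/* commit: open the fully populated tree to userland */
kernfs_activate(root_cgrp->kn);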
- */ - cred = override_creds(&init_cred); + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + root_cgrp); + if (IS_ERR(root->kf_root)) { + ret = PTR_ERR(root->kf_root); + goto exit_root_id; + } + root_cgrp->kn = root->kf_root->kn; ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); if (ret) - goto rm_base_files; + goto destroy_root; ret = rebind_subsystems(root, root->subsys_mask, 0); if (ret) - goto rm_base_files; - - revert_creds(cred); + goto destroy_root; /* * There must be no failure case after here, since rebinding takes @@ -1677,15 +1476,16 @@ static int cgroup_setup_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); + kernfs_activate(root_cgrp->kn); ret = 0; - goto out_unlock; + goto out; -rm_base_files: - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); +destroy_root: + kernfs_destroy_root(root->kf_root); + root->kf_root = NULL; +exit_root_id: cgroup_exit_root_id(root); -out_unlock: - mutex_unlock(&inode->i_mutex); +out: free_cgrp_cset_links(&tmp_links); return ret; } @@ -1694,10 +1494,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - struct super_block *sb = NULL; - struct cgroupfs_root *root = NULL; + struct cgroupfs_root *root; struct cgroup_sb_opts opts; - struct cgroupfs_root *new_root; + struct dentry *dentry; int ret; mutex_lock(&cgroup_tree_mutex); @@ -1708,41 +1507,32 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. - */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto out_unlock; - } - opts.new_root = new_root; + /* look for a matching existing root */ + for_each_active_root(root) { + bool name_match = false; - /* Locate an existing or new sb for this hierarchy */ - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgroup_tree_mutex); - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_free_root(opts.new_root); - goto out_unlock; - } + /* + * If we asked for a name then it must match. Also, if + * name matches but sybsys_mask doesn't, we should fail. + * Remember whether name matched. + */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - ret = cgroup_setup_root(root); - if (ret) - goto out_unlock; - } else { /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. 
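/*
 * The matching rules the new mount loop encodes, pulled out as a
 * standalone predicate for clarity (a hypothetical helper, not in the
 * patch): 1 means reuse this root, 0 means keep looking, and -EBUSY
 * means the requested name is taken by an incompatible hierarchy.
 */
static int root_matches(struct cgroup_sb_opts *opts,
			struct cgroupfs_root *root)
{
	bool name_match = false;

	if (opts->name) {
		if (strcmp(opts->name, root->name))
			return 0;	/* explicit name, different root */
		name_match = true;
	}
	if ((opts->subsys_mask || opts->none) &&
	    opts->subsys_mask != root->subsys_mask)
		return name_match ? -EBUSY : 0;	/* name clash is fatal */
	return 1;
}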
*/ - cgroup_free_root(opts.new_root); + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { @@ -1753,23 +1543,45 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } + + cgroup_get_root(root); + goto out_unlock; } - ret = 0; + /* no such thing, create a new one */ + root = cgroup_root_from_opts(&opts); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_unlock; + } + + ret = cgroup_setup_root(root); + if (ret) + cgroup_free_root(root); + out_unlock: mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - if (ret && !IS_ERR_OR_NULL(sb)) - deactivate_locked_super(sb); - kfree(opts.release_agent); kfree(opts.name); - if (!ret) - return dget(sb->s_root); - else + if (ret) return ERR_PTR(ret); + + dentry = kernfs_mount(fs_type, flags, root->kf_root); + if (IS_ERR(dentry)) + cgroup_put_root(root); + return dentry; +} + +static void cgroup_kill_sb(struct super_block *sb) +{ + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + + cgroup_put_root(root); + kernfs_kill_sb(sb); } static struct file_system_type cgroup_fs_type = { @@ -2301,29 +2113,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = max(cft->max_write_len, PAGE_SIZE); - char *buf; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of->kn->priv; + struct cgroup_subsys_state *css; int ret; - if (nbytes > max_bytes) - return -E2BIG; - - buf = kmalloc(nbytes + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, userbuf, nbytes)) { - ret = -EFAULT; - goto out_free; - } - - buf[nbytes] = '\0'; + /* + * kernfs guarantees that a file isn't deleted with operations in + * flight, which means that the matching css is and stays alive and + * doesn't need to be pinned. The RCU locking is not necessary + * either. It's just for the convenience of using cgroup_css(). + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + rcu_read_unlock(); if (cft->write_string) { ret = cft->write_string(css, cft, strstrip(buf)); @@ -2342,53 +2148,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, } else { ret = -EINVAL; } -out_free: - kfree(buf); + return ret ?: nbytes; } -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. - */ - static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_start) { - return cft->seq_start(seq, ppos); - } else { - /* - * The same behavior and code as single_open(). Returns - * !NULL if pos is at the beginning; otherwise, NULL. 
- */ - return NULL + !*ppos; - } + return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_next) { - return cft->seq_next(seq, v, ppos); - } else { - /* - * The same behavior and code as single_open(), always - * terminate after the initial read. - */ - ++*ppos; - return NULL; - } + return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_stop) - cft->seq_stop(seq, v); + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) @@ -2408,96 +2184,36 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return 0; } -static struct seq_operations cgroup_seq_operations = { - .start = cgroup_seqfile_start, - .next = cgroup_seqfile_next, - .stop = cgroup_seqfile_stop, - .show = cgroup_seqfile_show, +static struct kernfs_ops cgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_show = cgroup_seqfile_show, }; -static int cgroup_file_open(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - struct cgroup_subsys_state *css; - struct cgroup_open_file *of; - int err; - - err = generic_file_open(inode, file); - if (err) - return err; - - /* - * If the file belongs to a subsystem, pin the css. Will be - * unpinned either on open failure or release. This ensures that - * @css stays alive for all file operations. - */ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - if (cft->ss && !css_tryget(css)) - css = NULL; - rcu_read_unlock(); - - if (!css) - return -ENODEV; - - /* - * @cfe->css is used by read/write/close to determine the - * associated css. @file->private_data would be a better place but - * that's already used by seqfile. Multiple accessors may use it - * simultaneously which is okay as the association never changes. - */ - WARN_ON_ONCE(cfe->css && cfe->css != css); - cfe->css = css; - - of = __seq_open_private(file, &cgroup_seq_operations, - sizeof(struct cgroup_open_file)); - if (of) { - of->cfe = cfe; - return 0; - } - - if (css->ss) - css_put(css); - return -ENOMEM; -} - -static int cgroup_file_release(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - - if (css->ss) - css_put(css); - return seq_release_private(inode, file); -} +static struct kernfs_ops cgroup_kf_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_start = cgroup_seqfile_start, + .seq_next = cgroup_seqfile_next, + .seq_stop = cgroup_seqfile_stop, + .seq_show = cgroup_seqfile_show, +}; /* * cgroup_rename - Only allow simple rename of directories in place. */ -static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { - int ret; + struct cgroup *cgrp = kn->priv; struct cgroup_name *name, *old_name; - struct cgroup *cgrp; - - /* - * It's convinient to use parent dir's i_mutex to protected - * cgrp->name. 
- */ - lockdep_assert_held(&old_dir->i_mutex); + int ret; - if (!S_ISDIR(old_dentry->d_inode->i_mode)) + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) + if (kn->parent != new_parent) return -EIO; - cgrp = __d_cgrp(old_dentry); - /* * This isn't a proper migration and its usefulness is very * limited. Disallow if sane_behavior. @@ -2505,186 +2221,43 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_dentry->d_name.name); + name = cgroup_alloc_name(new_name_str); if (!name) return -ENOMEM; - ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); - if (ret) { - kfree(name); - return ret; + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + ret = kernfs_rename(kn, new_parent, new_name_str); + if (!ret) { + old_name = rcu_dereference_protected(cgrp->name, true); + rcu_assign_pointer(cgrp->name, name); + } else { + old_name = name; } - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); kfree_rcu(old_name, rcu_head); - return 0; -} - -static struct simple_xattrs *__d_xattrs(struct dentry *dentry) -{ - if (S_ISDIR(dentry->d_inode->i_mode)) - return &__d_cgrp(dentry)->xattrs; - else - return &__d_cfe(dentry)->xattrs; -} - -static inline int xattr_enabled(struct dentry *dentry) -{ - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return root->flags & CGRP_ROOT_XATTR; -} - -static bool is_valid_xattr(const char *name) -{ - if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) - return true; - return false; -} - -static int cgroup_setxattr(struct dentry *dentry, const char *name, - const void *val, size_t size, int flags) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); -} - -static int cgroup_removexattr(struct dentry *dentry, const char *name) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_remove(__d_xattrs(dentry), name); -} - -static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, - void *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_get(__d_xattrs(dentry), name, buf, size); -} - -static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - return simple_xattr_list(__d_xattrs(dentry), buf, size); -} - -static const struct file_operations cgroup_file_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, - .open = cgroup_file_open, - .release = cgroup_file_release, -}; - -static const struct inode_operations cgroup_file_inode_operations = { - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = 
cgroup_removexattr, -}; - -static int cgroup_create_file(struct dentry *dentry, umode_t mode, - struct super_block *sb) -{ - struct inode *inode; - - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; - - inode = cgroup_new_inode(mode, sb); - if (!inode) - return -ENOMEM; - - if (S_ISDIR(mode)) { - inode->i_op = &cgroup_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - inc_nlink(dentry->d_parent->d_inode); - - /* - * Control reaches here with cgroup_mutex held. - * @inode->i_mutex should nest outside cgroup_mutex but we - * want to populate it immediately without releasing - * cgroup_mutex. As @inode isn't visible to anyone else - * yet, trylock will always succeed without affecting - * lockdep checks. - */ - WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cgroup_file_operations; - inode->i_op = &cgroup_file_inode_operations; - } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; + return ret; } static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { - struct dentry *dir = cgrp->dentry; - struct cgroup *parent = __d_cgrp(dir); - struct dentry *dentry; - struct cfent *cfe; - int error; - umode_t mode; char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn; + struct lock_class_key *key = NULL; - BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); - - cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); - if (!cfe) - return -ENOMEM; - - cgroup_file_name(cgrp, cft, name); - dentry = lookup_one_len(name, dir, strlen(name)); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out; - } - - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); - - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); - if (!error) { - list_add_tail(&cfe->node, &parent->files); - cfe = NULL; - } - dput(dentry); -out: - kfree(cfe); - return error; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + key = &cft->lockdep_key; +#endif + kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), + cgroup_file_mode(cft), 0, cft->kf_ops, cft, + NULL, false, key); + if (IS_ERR(kn)) + return PTR_ERR(kn); + return 0; } /** @@ -2704,7 +2277,6 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { @@ -2749,9 +2321,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; - struct super_block *sb = ss->root->sb; struct cgroup *prev = NULL; - struct inode *inode; struct cgroup_subsys_state *css; u64 update_before; int ret = 0; @@ -2759,12 +2329,13 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) mutex_unlock(&cgroup_mutex); /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &cgroup_dummy_root || - !atomic_inc_not_zero(&sb->s_active)) { + if (!cfts || ss->root == &cgroup_dummy_root) { mutex_unlock(&cgroup_tree_mutex); return 0; } + cgroup_get_root(ss->root); + /* * All cgroups which are created after we drop cgroup_mutex will * have the updated set of files, so we only need to update the @@ -2779,18 +2350,16 @@ static int 
cgroup_cfts_commit(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - inode = cgrp->dentry->d_inode; cgroup_get(cgrp); if (prev) cgroup_put(prev); prev = cgrp; - mutex_unlock(&cgroup_tree_mutex); - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_tree_mutex); - if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) + if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) { ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&inode->i_mutex); + if (is_add) + kernfs_activate(cgrp->kn); + } if (ret) break; } @@ -2804,16 +2373,45 @@ static void cgroup_exit_cftypes(struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft->name[0] != '\0'; cft++) + for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* free copy for custom atomic_write_len, see init_cftypes() */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) + kfree(cft->kf_ops); + cft->kf_ops = NULL; cft->ss = NULL; + } } -static void cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; - for (cft = cfts; cft->name[0] != '\0'; cft++) + for (cft = cfts; cft->name[0] != '\0'; cft++) { + struct kernfs_ops *kf_ops; + + if (cft->seq_start) + kf_ops = &cgroup_kf_ops; + else + kf_ops = &cgroup_kf_single_ops; + + /* + * Ugh... if @cft wants a custom max_write_len, we need to + * make a copy of kf_ops to set its atomic_write_len. + */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { + kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); + if (!kf_ops) { + cgroup_exit_cftypes(cfts); + return -ENOMEM; + } + kf_ops->atomic_write_len = cft->max_write_len; + } + + cft->kf_ops = kf_ops; cft->ss = ss; + } + + return 0; } /** @@ -2839,7 +2437,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (!set) return -ENOMEM; - cgroup_init_cftypes(ss, cfts); + ret = cgroup_init_cftypes(ss, cfts); + if (ret) + return ret; cgroup_cfts_prepare(); set->cfts = cfts; @@ -3706,21 +3306,27 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { - int ret = -EINVAL; + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + /* - * Validate dentry by checking the superblock operations, - * and make sure it's a directory. + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. */ - if (dentry->d_sb->s_op != &cgroup_ops || - !S_ISDIR(dentry->d_inode->i_mode)) - goto err; - - ret = 0; - cgrp = dentry->d_fsdata; + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp) { + rcu_read_unlock(); + return -ENOENT; + } css_task_iter_start(&cgrp->dummy_css, &it); while ((tsk = css_task_iter_next(&it))) { @@ -3745,8 +3351,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); -err: - return ret; + rcu_read_unlock(); + return 0; } @@ -3764,7 +3370,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). 
Use a binary-search to find the * next pid to display, if any */ - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -3819,7 +3425,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; if (l) @@ -3830,7 +3436,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; @@ -3880,21 +3486,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, return 0; } -/* - * When dput() is called asynchronously, if umount has been done and - * then deactivate_super() in cgroup_free_fn() kills the superblock, - * there's a small window that vfs will see the root dentry with non-zero - * refcnt and trigger BUG(). - * - * That's why we hold a reference before dput() and drop it right after. - */ -static void cgroup_dput(struct cgroup *cgrp) -{ - cgroup_get_root(cgrp->root); - cgroup_put(cgrp); - cgroup_put_root(cgrp->root); -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4029,7 +3620,7 @@ static void css_free_work_fn(struct work_struct *work) css_put(css->parent); css->ss->css_free(css); - cgroup_dput(cgrp); + cgroup_put(cgrp); } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4037,10 +3628,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) struct cgroup_subsys_state *css = container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context which we don't have. 
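/*
 * css_free_rcu_fn() below is the standard two-step deferral that the
 * comment being removed here alludes to: RCU callbacks run in softirq
 * context, so work that may sleep is bounced to a workqueue.  Generic
 * shape, with hypothetical names:
 */
struct item {
	struct rcu_head rcu_head;
	struct work_struct free_work;
};

static void item_free_work_fn(struct work_struct *work)
{
	struct item *it = container_of(work, struct item, free_work);

	/* process context: free paths may sleep and take mutexes */
	kfree(it);
}

static void item_free_rcu_fn(struct rcu_head *rcu_head)
{
	struct item *it = container_of(rcu_head, struct item, rcu_head);

	/* softirq context: cannot sleep, punt to a workqueue */
	INIT_WORK(&it->free_work, item_free_work_fn);
	queue_work(system_wq, &it->free_work);
}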
- */ INIT_WORK(&css->destroy_work, css_free_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } @@ -4122,7 +3709,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) struct cgroup_subsys_state *css; int err; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(cgroup_css(parent, ss)); @@ -4163,30 +3749,28 @@ err_free: return err; } -/* +/** * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup - * @dentry: dentry of the new cgroup - * @mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held + * @name_str: name of the new cgroup + * @mode: mode to set on new cgroup */ -static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - umode_t mode) +static long cgroup_create(struct cgroup *parent, const char *name_str, + umode_t mode) { struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; - struct super_block *sb = root->sb; + struct kernfs_node *kn; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(dentry->d_name.name); + name = cgroup_alloc_name(name_str); if (!name) { err = -ENOMEM; goto err_free_cgrp; @@ -4217,18 +3801,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_unlock; } - /* Grab a reference on the superblock so the hierarchy doesn't - * get deleted on unmount if there are child cgroups. This - * can be done outside cgroup_mutex, since the sb can't - * disappear while someone has an open control file on the - * fs */ - cgroup_get_root(root); - init_cgroup_housekeeping(cgrp); - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; - cgrp->parent = parent; cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; @@ -4239,15 +3813,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* - * Create directory. cgroup_create_file() returns with the new - * directory locked on success so that it can be populated without - * dropping cgroup_mutex. - */ - err = cgroup_create_file(dentry, S_IFDIR | mode, sb); - if (err < 0) + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name->name, mode, cgrp); + if (IS_ERR(kn)) { + err = PTR_ERR(kn); goto err_free_id; - lockdep_assert_held(&dentry->d_inode->i_mutex); + } + cgrp->kn = kn; cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4255,7 +3827,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* hold a ref to the parent's dentry */ + /* + * Grab a reference on the root and parent so that they don't get + * deleted while there are child cgroups. 
+ */ + cgroup_get_root(root); cgroup_get(parent); /* @@ -4277,16 +3853,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } + kernfs_activate(kn); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); - /* Release the reference count that we took on the superblock */ - cgroup_put_root(root); err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: @@ -4300,16 +3875,15 @@ err_destroy: cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); return err; } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *c_parent = dentry->d_parent->d_fsdata; + struct cgroup *parent = parent_kn->priv; - /* the vfs holds inode->i_mutex already */ - return cgroup_create(c_parent, dentry, mode | S_IFDIR); + return cgroup_create(parent, name, mode); } /* @@ -4373,6 +3947,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { + /* + * This must happen before css is disassociated with its cgroup. + * See seq_css() for details. + */ cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* @@ -4421,13 +3999,12 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct dentry *d = cgrp->dentry; - struct cgroup_subsys_state *css; struct cgroup *child; + struct cgroup_subsys_state *css; + struct kernfs_node *kn; bool empty; int ssid; - lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4492,15 +4069,24 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!cgrp->nr_css) cgroup_destroy_css_killed(cgrp); + /* remove @cgrp directory along with the base files */ + mutex_unlock(&cgroup_mutex); + /* - * Clear the base files and remove @cgrp directory. The removal - * puts the base ref but we aren't quite done with @cgrp yet, so - * hold onto it. + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_from_dir(). Those are supported by RCU protecting + * clearing of cgrp->kn->priv backpointer, which should happen + * after all files under it have been removed. */ - mutex_unlock(&cgroup_mutex); - cgroup_addrm_files(cgrp, cgroup_base_files, false); - dget(d); - cgroup_d_remove_dir(d); + kn = cgrp->kn; + kernfs_get(kn); + + kernfs_remove(cgrp->kn); + + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + kernfs_put(kn); + mutex_lock(&cgroup_mutex); return 0; @@ -4531,19 +4117,46 @@ static void cgroup_destroy_css_killed(struct cgroup *cgrp) check_for_release(parent); } -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +static int cgroup_rmdir(struct kernfs_node *kn) { - int ret; + struct cgroup *cgrp = kn->priv; + int ret = 0; + + /* + * This is self-destruction but @kn can't be removed while this + * callback is in progress. Let's break active protection. Once + * the protection is broken, @cgrp can be destroyed at any point. + * Pin it so that it stays accessible. 
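/*
 * The complete self-removal pattern cgroup_rmdir() adopts, condensed
 * from the surrounding patch (cgroup_tree_mutex elided for brevity):
 * rmdir runs with an active reference on its own kernfs_node, so
 * removing the node directly would deadlock, hence the break/unbreak
 * dance.
 */
static int self_rmdir(struct kernfs_node *kn)
{
	struct cgroup *cgrp = kn->priv;
	int ret = 0;

	cgroup_get(cgrp);			/* pin across the unprotected window */
	kernfs_break_active_protection(kn);	/* allow removal of our own node */

	mutex_lock(&cgroup_mutex);
	if (!cgroup_is_dead(cgrp))		/* may have lost a race to another rmdir */
		ret = cgroup_destroy_locked(cgrp);
	mutex_unlock(&cgroup_mutex);

	kernfs_unbreak_active_protection(kn);
	cgroup_put(cgrp);
	return ret;
}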
+ */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - ret = cgroup_destroy_locked(dentry->d_fsdata); + + /* + * @cgrp might already have been destroyed while we're trying to + * grab the mutexes. + */ + if (!cgroup_is_dead(cgrp)) + ret = cgroup_destroy_locked(cgrp); + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); return ret; } +static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .rename = cgroup_rename, +}; + static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; @@ -4635,11 +4248,7 @@ int __init cgroup_init(void) unsigned long key; int i, err; - err = bdi_init(&cgroup_backing_dev_info); - if (err) - return err; - - cgroup_init_cftypes(NULL, cgroup_base_files); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); for_each_subsys(ss, i) { if (!ss->early_init) @@ -4669,24 +4278,17 @@ int __init cgroup_init(void) mutex_unlock(&cgroup_mutex); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); - if (!cgroup_kobj) { - err = -ENOMEM; - goto out; - } + if (!cgroup_kobj) + return -ENOMEM; err = register_filesystem(&cgroup_fs_type); if (err < 0) { kobject_put(cgroup_kobj); - goto out; + return err; } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); - -out: - if (err) - bdi_destroy(&cgroup_backing_dev_info); - - return err; + return 0; } static int __init cgroup_wq_init(void) @@ -5095,18 +4697,25 @@ __setup("cgroup_disable=", cgroup_disable); struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; - struct cgroup_subsys_state *css; /* is @dentry a cgroup dir? */ - if (!dentry->d_inode || - dentry->d_inode->i_op != &cgroup_dir_inode_operations) + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); - cgrp = __d_cgrp(dentry); - css = cgroup_css(cgrp, ss); + /* + * This path doesn't originate from kernfs and @kn could already + * have been or be removed at any point. @kn->priv is RCU + * protected for this access. See destroy_locked() for details. + */ + cgrp = rcu_dereference(kn->priv); + if (cgrp) + css = cgroup_css(cgrp, ss); if (!css || !css_tryget(css)) css = ERR_PTR(-ENOENT); -- cgit v1.3-14-g43fede From d651aa1d68a2f0a7ee65697b04c6a92f8c0a12f2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 11 Feb 2014 13:38:54 -0500 Subject: ring-buffer: Fix first commit on sub-buffer having non-zero delta Each sub-buffer (buffer page) has a full 64 bit timestamp. The events on that page use a 27 bit delta against that timestamp in order to save on bits written to the ring buffer. If the time between events is larger than what the 27 bits can hold, a "time extend" event is added to hold the entire 64 bit timestamp again and the events after that hold a delta from that timestamp. As a "time extend" is always paired with an event, it is logical to just allocate the event with the time extend, to make things a bit more efficient. Unfortunately, when the pairing code was written, it removed the "delta = 0" from the first commit on a page, causing the events on the page to be slightly skewed. 
Fixes: 69d1b839f7ee "ring-buffer: Bind time extend and data events together" Cc: stable@vger.kernel.org # 2.6.37+ Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 294b8a271a04..fc4da2d97f9b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2397,6 +2397,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, write &= RB_WRITE_MASK; tail = write - length; + /* + * If this is the first commit on the page, then it has the same + * timestamp as the page itself. + */ + if (!tail) + delta = 0; + /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) return rb_move_tail(cpu_buffer, length, tail, -- cgit v1.3-14-g43fede From 86bf4b68759141459864ebd36ac3038a9cda895b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:48 -0500 Subject: cgroup: warn if "xattr" is specified with "sane_behavior" Mount option "xattr" is no longer necessary as it's enabled by default on kernfs. Warn if "xattr" is specified with "sane_behavior" so that the option can be removed in the future. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0e45a932b823..305e94ee17f5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -264,6 +264,8 @@ enum { * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * + * - "xattr" mount option is deprecated. kernfs always enables it. + * * - cpuset: tasks will be kept in empty cpusets when hotplug happens * and take masks of ancestors with non-empty cpus/mems, instead of * being moved to an ancestor. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cda614da40cf..a0fab71f200f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1267,6 +1267,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); return -EINVAL; } + + if (opts->flags & CGRP_ROOT_XATTR) + pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n"); } /* -- cgit v1.3-14-g43fede From 80b13586997d8e584caa772bd99e2a3e55ac6abe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:48 -0500 Subject: cgroup: relocate cgroup_rm_cftypes() cftype handling is about to be revamped. Relocate cgroup_rm_cftypes() above cgroup_add_cftypes() in preparation. This is pure relocation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 70 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a0fab71f200f..a2cbd1549995 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2417,6 +2417,41 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return 0; } +/** + * cgroup_rm_cftypes - remove an array of cftypes from a subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. 
+ * + * Returns 0 on successful unregistration, -ENOENT if @cfts is not + * registered. + */ +int cgroup_rm_cftypes(struct cftype *cfts) +{ + struct cftype *found = NULL; + struct cftype_set *set; + + if (!cfts || !cfts[0].ss) + return -ENOENT; + + cgroup_cfts_prepare(); + + list_for_each_entry(set, &cfts[0].ss->cftsets, node) { + if (set->cfts == cfts) { + list_del(&set->node); + kfree(set); + found = cfts; + break; + } + } + + cgroup_cfts_commit(found, false); + cgroup_exit_cftypes(cfts); + return found ? 0 : -ENOENT; +} + /** * cgroup_add_cftypes - add an array of cftypes to a subsystem * @ss: target cgroup subsystem @@ -2454,41 +2489,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); -/** - * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Unregister @cfts. Files described by @cfts are removed from all - * existing cgroups and all future cgroups won't have them either. This - * function can be called anytime whether @cfts' subsys is attached or not. - * - * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered. - */ -int cgroup_rm_cftypes(struct cftype *cfts) -{ - struct cftype *found = NULL; - struct cftype_set *set; - - if (!cfts || !cfts[0].ss) - return -ENOENT; - - cgroup_cfts_prepare(); - - list_for_each_entry(set, &cfts[0].ss->cftsets, node) { - if (set->cfts == cfts) { - list_del(&set->node); - kfree(set); - found = cfts; - break; - } - } - - cgroup_cfts_commit(found, false); - cgroup_exit_cftypes(cfts); - return found ? 0 : -ENOENT; -} - /** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question -- cgit v1.3-14-g43fede From 0adb070426dde2fd0b84e7f4f5cefcd8f0b24410 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:48 -0500 Subject: cgroup: remove cftype_set cftype_set was added primarily to allow registering the same cftype array more than once for different subsystems. Nobody uses or needs such thing and it's already broken because each cftype has ->ss pointer which is initialized during registration. Let's add list_head ->node to cftype and use the first cftype entry in the array to link them instead of allocating separate cftype_set. While at it, trigger WARN if cft seems previously initialized during registration. This simplifies cftype handling a bit. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 26 +++++++++----------------- kernel/cgroup.c | 41 +++++++++++++---------------------------- 2 files changed, 22 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 305e94ee17f5..b42251a23129 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -412,12 +412,11 @@ struct cftype { unsigned int flags; /* - * The subsys this file belongs to. Initialized automatically - * during registration. NULL for cgroup core files. + * Fields used for internal bookkeeping. Initialized automatically + * during registration. */ - struct cgroup_subsys *ss; - - /* kernfs_ops to use, initialized automatically during registration */ + struct cgroup_subsys *ss; /* NULL for cgroup core files */ + struct list_head node; /* anchored at ss->cfts */ struct kernfs_ops *kf_ops; /* @@ -471,16 +470,6 @@ struct cftype { #endif }; -/* - * cftype_sets describe cftypes belonging to a subsystem and are chained at - * cgroup_subsys->cftsets. 
Each cftset points to an array of cftypes - * terminated by zero length name. - */ -struct cftype_set { - struct list_head node; /* chained at subsys->cftsets */ - struct cftype *cfts; -}; - /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. @@ -597,8 +586,11 @@ struct cgroup_subsys { /* link to parent, protected by cgroup_lock() */ struct cgroupfs_root *root; - /* list of cftype_sets */ - struct list_head cftsets; + /* + * List of cftypes. Each entry is the first entry of an array + * terminated by zero length name. + */ + struct list_head cfts; /* base cftypes, automatically registered with subsys itself */ struct cftype *base_cftypes; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a2cbd1549995..506ebd61d1c2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1016,12 +1016,12 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) int i; for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, set->cfts, false); + list_for_each_entry(cfts, &ss->cfts, node) + cgroup_addrm_files(cgrp, cfts, false); } } @@ -2392,6 +2392,8 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; + WARN_ON(cft->ss || cft->kf_ops); + if (cft->seq_start) kf_ops = &cgroup_kf_ops; else @@ -2430,26 +2432,15 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - struct cftype *found = NULL; - struct cftype_set *set; - if (!cfts || !cfts[0].ss) return -ENOENT; cgroup_cfts_prepare(); + list_del(&cfts->node); + cgroup_cfts_commit(cfts, false); - list_for_each_entry(set, &cfts[0].ss->cftsets, node) { - if (set->cfts == cfts) { - list_del(&set->node); - kfree(set); - found = cfts; - break; - } - } - - cgroup_cfts_commit(found, false); cgroup_exit_cftypes(cfts); - return found ? 
0 : -ENOENT; + return 0; } /** @@ -2468,20 +2459,14 @@ int cgroup_rm_cftypes(struct cftype *cfts) */ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { - struct cftype_set *set; int ret; - set = kzalloc(sizeof(*set), GFP_KERNEL); - if (!set) - return -ENOMEM; - ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; cgroup_cfts_prepare(); - set->cfts = cfts; - list_add_tail(&set->node, &ss->cftsets); + list_add_tail(&cfts->node, &ss->cfts); ret = cgroup_cfts_commit(cfts, true); if (ret) cgroup_rm_cftypes(cfts); @@ -3574,13 +3559,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* process cftsets of each subsystem */ for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, set->cfts, true); + list_for_each_entry(cfts, &ss->cfts, node) { + ret = cgroup_addrm_files(cgrp, cfts, true); if (ret < 0) goto err; } @@ -4169,7 +4154,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - INIT_LIST_HEAD(&ss->cftsets); + INIT_LIST_HEAD(&ss->cfts); /* Create the top cgroup state for this subsystem */ ss->root = &cgroup_dummy_root; -- cgit v1.3-14-g43fede From 21a2d3430ba8c188af405a5c2eb9c06bdcb6add6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:49 -0500 Subject: cgroup: simplify dynamic cftype addition and removal Dynamic cftype addition and removal using cgroup_add/rm_cftypes() respectively has been quite hairy due to vfs i_mutex. As i_mutex nests outside cgroup_mutex, cgroup_mutex has to be released and regrabbed on each iteration through the hierarchy complicating the process. Now that i_mutex is no longer in play, it can be simplified. * Just holding cgroup_tree_mutex is enough. No need to meddle with cgroup_mutex. * No reason to play the unlock - relock - check serial_nr dancing. Everything can be atomically while holding cgroup_tree_mutex. * cgroup_cfts_prepare() is replaced with direct locking of cgroup_tree_mutex. * cgroup_cfts_commit() no longer fiddles with locking. It just applies the cftypes change to the existing cgroups in the hierarchy. Renamed to cgroup_cfts_apply(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 87 +++++++++++++++++++++------------------------------------ 1 file changed, 32 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 506ebd61d1c2..f4409715a2f5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2305,46 +2305,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], return 0; } -static void cgroup_cfts_prepare(void) - __acquires(&cgroup_mutex) -{ - /* - * Thanks to the entanglement with vfs inode locking, we can't walk - * the existing cgroups under cgroup_mutex and create files. - * Instead, we use css_for_each_descendant_pre() and drop RCU read - * lock before calling cgroup_addrm_files(). 
- */ - mutex_lock(&cgroup_tree_mutex); - mutex_lock(&cgroup_mutex); -} - -static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) - __releases(&cgroup_mutex) +static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; - struct cgroup *prev = NULL; struct cgroup_subsys_state *css; - u64 update_before; int ret = 0; - mutex_unlock(&cgroup_mutex); + lockdep_assert_held(&cgroup_tree_mutex); - /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &cgroup_dummy_root) { - mutex_unlock(&cgroup_tree_mutex); + /* don't bother if @ss isn't attached */ + if (ss->root == &cgroup_dummy_root) return 0; - } - - cgroup_get_root(ss->root); - - /* - * All cgroups which are created after we drop cgroup_mutex will - * have the updated set of files, so we only need to update the - * cgroups created before the current @cgroup_serial_nr_next. - */ - update_before = cgroup_serial_nr_next; /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -2353,22 +2326,13 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - cgroup_get(cgrp); - if (prev) - cgroup_put(prev); - prev = cgrp; - - if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) { - ret = cgroup_addrm_files(cgrp, cfts, is_add); - if (is_add) - kernfs_activate(cgrp->kn); - } + ret = cgroup_addrm_files(cgrp, cfts, is_add); if (ret) break; } - mutex_unlock(&cgroup_tree_mutex); - cgroup_put(prev); - cgroup_put_root(ss->root); + + if (is_add && !ret) + kernfs_activate(root->kn); return ret; } @@ -2419,6 +2383,19 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) return 0; } +static int cgroup_rm_cftypes_locked(struct cftype *cfts) +{ + lockdep_assert_held(&cgroup_tree_mutex); + + if (!cfts || !cfts[0].ss) + return -ENOENT; + + list_del(&cfts->node); + cgroup_apply_cftypes(cfts, false); + cgroup_exit_cftypes(cfts); + return 0; +} + /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem * @cfts: zero-length name terminated array of cftypes @@ -2432,15 +2409,12 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) */ int cgroup_rm_cftypes(struct cftype *cfts) { - if (!cfts || !cfts[0].ss) - return -ENOENT; - - cgroup_cfts_prepare(); - list_del(&cfts->node); - cgroup_cfts_commit(cfts, false); + int ret; - cgroup_exit_cftypes(cfts); - return 0; + mutex_lock(&cgroup_tree_mutex); + ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_tree_mutex); + return ret; } /** @@ -2465,11 +2439,14 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (ret) return ret; - cgroup_cfts_prepare(); + mutex_lock(&cgroup_tree_mutex); + list_add_tail(&cfts->node, &ss->cfts); - ret = cgroup_cfts_commit(cfts, true); + ret = cgroup_apply_cftypes(cfts, true); if (ret) - cgroup_rm_cftypes(cfts); + cgroup_rm_cftypes_locked(cfts); + + mutex_unlock(&cgroup_tree_mutex); return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); -- cgit v1.3-14-g43fede From 6f30558f37bfbd428e3854c2c34b5c32117c8f7e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: make cgroup hold onto its kernfs_node cgroup currently releases its kernfs_node when it gets removed. 
While not buggy, this makes cgroup->kn access rules complicated than necessary and leads to things like get/put protection around kernfs_remove() in cgroup_destroy_locked(). In addition, we want to use kernfs_name/path() and friends but also want to be able to determine a cgroup's name between removal and release. This patch makes cgroup hold onto its kernfs_node until freed so that cgroup->kn is always accessible. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f4409715a2f5..59dfb025f1ac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -957,6 +957,8 @@ static void cgroup_free_fn(struct work_struct *work) cgroup_pidlist_destroy_all(cgrp); + kernfs_put(cgrp->kn); + kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -3786,6 +3788,12 @@ static long cgroup_create(struct cgroup *parent, const char *name_str, } cgrp->kn = kn; + /* + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. + */ + kernfs_get(kn); + cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ @@ -3966,7 +3974,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) { struct cgroup *child; struct cgroup_subsys_state *css; - struct kernfs_node *kn; bool empty; int ssid; @@ -4044,13 +4051,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * clearing of cgrp->kn->priv backpointer, which should happen * after all files under it have been removed. */ - kn = cgrp->kn; - kernfs_get(kn); - - kernfs_remove(cgrp->kn); - + kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); - kernfs_put(kn); mutex_lock(&cgroup_mutex); -- cgit v1.3-14-g43fede From e61734c55c24cdf11b07e52a74aec4dc4a7f4bd0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: remove cgroup->name cgroup->name handling became quite complicated over time involving dedicated struct cgroup_name for RCU protection. Now that cgroup is on kernfs, we can drop all of it and simply use kernfs_name/path() and friends. Replace cgroup->name and all related code with kernfs name/path constructs. * Reimplement cgroup_name() and cgroup_path() as thin wrappers on top of kernfs counterparts, which involves semantic changes. pr_cont_cgroup_name() and pr_cont_cgroup_path() added. * cgroup->name handling dropped from cgroup_rename(). * All users of cgroup_name/path() updated to the new semantics. Users which were formatting the string just to printk them are converted to use pr_cont_cgroup_name/path() instead, which simplifies things quite a bit. As cgroup_name() no longer requires RCU read lock around it, RCU lockings which were protecting only cgroup_name() are removed. v2: Comment above oom_info_lock updated as suggested by Michal. v3: dummy_top doesn't have a kn associated and pr_cont_cgroup_name/path() ended up calling the matching kernfs functions with NULL kn leading to oops. Test for NULL kn and print "/" if so. This issue was reported by Fengguang Wu. v4: Rebased on top of 0ab02ca8f887 ("cgroup: protect modifications to cgroup_idr with cgroup_mutex"). 
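The replacement cgroup_path() adopts the kernfs convention visible in the hunks below: the path is assembled right-to-left in the caller's buffer and the function returns a pointer into that buffer, or NULL when it does not fit, instead of 0/-ENAMETOOLONG. A minimal user-space sketch of that convention, assuming a parent-linked node tree; "struct node" and node_path() are illustrative stand-ins for kernfs_node and kernfs_path():

#include <stdio.h>
#include <string.h>

struct node {
    const char *name;
    struct node *parent;        /* NULL for the root */
};

/* Build the path right-to-left; return a pointer into buf or NULL. */
static char *node_path(struct node *n, char *buf, size_t buflen)
{
    char *p = buf + buflen;

    *--p = '\0';
    if (!n->parent) {                   /* the root maps to "/" */
        if (p == buf)
            return NULL;
        *--p = '/';
        return p;
    }
    for (; n->parent; n = n->parent) {
        size_t len = strlen(n->name);

        if ((size_t)(p - buf) < len + 1)
            return NULL;                /* buffer too small */
        p -= len;
        memcpy(p, n->name, len);
        *--p = '/';
    }
    return p;
}

int main(void)
{
    struct node root = { "", NULL };
    struct node a = { "a", &root };
    struct node b = { "b", &a };
    char buf[64];
    char *path = node_path(&b, buf, sizeof(buf));

    printf("%s\n", path ? path : "(too long)");    /* prints /a/b */
    return 0;
}

Callers such as blkg_path() below then test the returned pointer and memmove() the result to the start of the buffer when a left-aligned string is required.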
Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Michal Hocko Acked-by: Li Zefan Cc: Fengguang Wu Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- block/blk-cgroup.h | 12 ++-- fs/kernfs/dir.c | 1 + include/linux/cgroup.h | 63 ++++++++++++--------- kernel/cgroup.c | 146 +++++++++++-------------------------------------- kernel/cpuset.c | 27 +++++---- kernel/sched/debug.c | 3 +- mm/memcontrol.c | 68 ++++++----------------- 7 files changed, 110 insertions(+), 210 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 453b528c8e19..15a8d640de57 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -241,12 +241,16 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) */ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { - int ret; + char *p; - ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (ret) + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { strncpy(buf, "", buflen); - return ret; + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; } /** diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a347792c2e5a..939684ebff1e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -112,6 +112,7 @@ char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) spin_unlock_irqrestore(&kernfs_rename_lock, flags); return p; } +EXPORT_SYMBOL_GPL(kernfs_path); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b42251a23129..4d6ff7d40cf6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -138,11 +138,6 @@ enum { CGRP_SANE_BEHAVIOR, }; -struct cgroup_name { - struct rcu_head rcu_head; - char name[]; -}; - struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ @@ -179,19 +174,6 @@ struct cgroup { */ u64 serial_nr; - /* - * This is a copy of dentry->d_name, and it's needed because - * we can't use dentry->d_name in cgroup_path(). - * - * You must acquire rcu_read_lock() to access cgrp->name, and - * the only place that can change it is rename(), which is - * protected by parent dir's i_mutex. - * - * Normally you should use cgroup_name() wrapper rather than - * access it directly. - */ - struct cgroup_name __rcu *name; - /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -479,12 +461,6 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; } -/* Caller should hold rcu_read_lock() */ -static inline const char *cgroup_name(const struct cgroup *cgrp) -{ - return rcu_dereference(cgrp->name)->name; -} - /* returns ino associated with a cgroup, 0 indicates unmounted root */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { @@ -503,14 +479,47 @@ static inline struct cftype *seq_cft(struct seq_file *seq) struct cgroup_subsys_state *seq_css(struct seq_file *seq); +/* + * Name / path handling functions. All are thin wrappers around the kernfs + * counterparts and can be called under any context. 
+ */ + +static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) +{ + return kernfs_name(cgrp->kn, buf, buflen); +} + +static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, + size_t buflen) +{ + return kernfs_path(cgrp->kn, buf, buflen); +} + +static inline void pr_cont_cgroup_name(struct cgroup *cgrp) +{ + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + pr_cont_kernfs_name(cgrp->kn); + else + pr_cont("/"); +} + +static inline void pr_cont_cgroup_path(struct cgroup *cgrp) +{ + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + pr_cont_kernfs_path(cgrp->kn); + else + pr_cont("/"); +} + +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); - int cgroup_task_count(const struct cgroup *cgrp); /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 59dfb025f1ac..638df032fb94 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -145,8 +145,6 @@ static int cgroup_root_count; /* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); -static struct cgroup_name root_cgroup_name = { .name = "/" }; - /* * Assign a monotonically increasing serial number to cgroups. It * guarantees cgroups with bigger numbers are newer than those with smaller @@ -888,17 +886,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct cgroup_name *cgroup_alloc_name(const char *name_str) -{ - struct cgroup_name *name; - - name = kmalloc(sizeof(*name) + strlen(name_str) + 1, GFP_KERNEL); - if (!name) - return NULL; - strcpy(name->name, name_str); - return name; -} - static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, char *buf) { @@ -958,8 +945,6 @@ static void cgroup_free_fn(struct work_struct *work) cgroup_pidlist_destroy_all(cgrp); kernfs_put(cgrp->kn); - - kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -1377,7 +1362,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->root_list); root->number_of_cgroups = 1; cgrp->root = root; - RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); } @@ -1597,57 +1581,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; -/** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Writes path of cgroup into buf. Returns 0 on success, -errno on error. - * - * We can't generate cgroup path using dentry->d_name, as accessing - * dentry->name must be protected by irq-unsafe dentry->d_lock or parent - * inode's i_mutex, while on the other hand cgroup_path() can be called - * with some irq-safe spinlocks held. 
- */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - int ret = -ENAMETOOLONG; - char *start; - - if (!cgrp->parent) { - if (strlcpy(buf, "/", buflen) >= buflen) - return -ENAMETOOLONG; - return 0; - } - - start = buf + buflen - 1; - *start = '\0'; - - rcu_read_lock(); - do { - const char *name = cgroup_name(cgrp); - int len; - - len = strlen(name); - if ((start -= len) < buf) - goto out; - memcpy(start, name, len); - - if (--start < buf) - goto out; - *start = '/'; - - cgrp = cgrp->parent; - } while (cgrp->parent); - ret = 0; - memmove(buf, start, buf + buflen - start); -out: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_path); - /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -1659,16 +1592,14 @@ EXPORT_SYMBOL_GPL(cgroup_path); * function grabs cgroup_mutex and shouldn't be used inside locks used by * cgroup controller callbacks. * - * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. + * Return value is the same as kernfs_path(). */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroupfs_root *root; struct cgroup *cgrp; - int hierarchy_id = 1, ret = 0; - - if (buflen < 2) - return -ENAMETOOLONG; + int hierarchy_id = 1; + char *path = NULL; mutex_lock(&cgroup_mutex); @@ -1676,14 +1607,15 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path(cgrp, buf, buflen); + path = cgroup_path(cgrp, buf, buflen); } else { /* if no hierarchy exists, everyone is in "/" */ - memcpy(buf, "/", 2); + if (strlcpy(buf, "/", buflen) < buflen) + path = buf; } mutex_unlock(&cgroup_mutex); - return ret; + return path; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -2211,7 +2143,6 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; - struct cgroup_name *name, *old_name; int ret; if (kernfs_type(kn) != KERNFS_DIR) @@ -2226,25 +2157,13 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_name_str); - if (!name) - return -ENOMEM; - mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); ret = kernfs_rename(kn, new_parent, new_name_str); - if (!ret) { - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); - } else { - old_name = name; - } mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); - - kfree_rcu(old_name, rcu_head); return ret; } @@ -3719,14 +3638,13 @@ err_free: /** * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup - * @name_str: name of the new cgroup + * @name: name of the new cgroup * @mode: mode to set on new cgroup */ -static long cgroup_create(struct cgroup *parent, const char *name_str, +static long cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup *cgrp; - struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; @@ -3737,13 +3655,6 @@ static long cgroup_create(struct cgroup *parent, const char *name_str, if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(name_str); - if (!name) { - err = -ENOMEM; - goto err_free_cgrp; - } - rcu_assign_pointer(cgrp->name, name); - mutex_lock(&cgroup_tree_mutex); /* @@ 
-3781,7 +3692,7 @@ static long cgroup_create(struct cgroup *parent, const char *name_str, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); /* create the directory */ - kn = kernfs_create_dir(parent->kn, name->name, mode, cgrp); + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); if (IS_ERR(kn)) { err = PTR_ERR(kn); goto err_free_id; @@ -3839,8 +3750,6 @@ err_unlock: mutex_unlock(&cgroup_mutex); err_unlock_tree: mutex_unlock(&cgroup_tree_mutex); - kfree(rcu_dereference_raw(cgrp->name)); -err_free_cgrp: kfree(cgrp); return err; @@ -4304,12 +4213,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *path; int retval; struct cgroupfs_root *root; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -4337,10 +4246,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); - if (retval < 0) + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; goto out_unlock; - seq_puts(m, buf); + } + seq_puts(m, path); seq_putc(m, '\n'); } @@ -4588,16 +4499,17 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf = NULL, *agentbuf = NULL, *path; struct cgroup *cgrp = list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) goto continue_free; - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) goto continue_free; agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); if (!agentbuf) @@ -4605,7 +4517,7 @@ static void cgroup_release_agent(struct work_struct *work) i = 0; argv[i++] = agentbuf; - argv[i++] = pathbuf; + argv[i++] = path; argv[i] = NULL; i = 0; @@ -4755,6 +4667,11 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; read_lock(&css_set_lock); rcu_read_lock(); @@ -4763,14 +4680,17 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) struct cgroup *c = link->cgrp; const char *name = "?"; - if (c != cgroup_dummy_top) - name = cgroup_name(c); + if (c != cgroup_dummy_top) { + cgroup_name(c, name_buf, NAME_MAX + 1); + name = name_buf; + } seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name); } rcu_read_unlock(); read_unlock(&css_set_lock); + kfree(name_buf); return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2d018c795fea..e97a6e88d036 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2088,10 +2088,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - rcu_read_lock(); - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", - cgroup_name(cs->css.cgroup)); - rcu_read_unlock(); + printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); } } @@ -2619,19 +2618,17 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) /* 
Statically allocated to prevent using excess stack. */ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); - struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - rcu_read_lock(); spin_lock(&cpuset_buffer_lock); nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + printk(KERN_INFO "%s cpuset=", tsk->comm); + pr_cont_cgroup_name(cgrp); + pr_cont(" mems_allowed=%s\n", cpuset_nodelist); spin_unlock(&cpuset_buffer_lock); - rcu_read_unlock(); } /* @@ -2681,12 +2678,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *p; struct cgroup_subsys_state *css; int retval; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -2696,14 +2693,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; + retval = -ENAMETOOLONG; rcu_read_lock(); css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); - if (retval < 0) + if (!p) goto out_put_task; - seq_puts(m, buf); + seq_puts(m, p); seq_putc(m, '\n'); + retval = 0; out_put_task: put_task_struct(tsk); out_free: diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10e..30eee3b5293d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; + return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); } #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 102ab48ffa13..c1c25494f7ae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1683,15 +1683,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* - * protects memcg_name and makes sure that parallel ooms do not - * interleave - */ + /* oom_info_lock ensures that parallel ooms do not interleave */ static DEFINE_SPINLOCK(oom_info_lock); - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; - static char memcg_name[PATH_MAX]; - int ret; struct mem_cgroup *iter; unsigned int i; @@ -1701,36 +1694,14 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) spin_lock(&oom_info_lock); rcu_read_lock(); - mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, memory_cgrp_id); + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_info(" killed as a result of limit of "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_info("\n"); - ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - /* - * Unfortunately, we are unable to convert to a useful name - * But we'll still print out the usage information - */ - rcu_read_unlock(); - goto done; - } rcu_read_unlock(); - pr_info("Task in %s killed", memcg_name); - - rcu_read_lock(); - ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - rcu_read_unlock(); - goto done; - } - rcu_read_unlock(); - - /* - * Continues from above, so we don't need an KERN_ level - */ - pr_cont(" as a result of limit of %s\n", memcg_name); -done: - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 
res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, @@ -1745,13 +1716,8 @@ done: res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); for_each_mem_cgroup_tree(iter, memcg) { - pr_info("Memory cgroup stats"); - - rcu_read_lock(); - ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); - if (!ret) - pr_cont(" for %s", memcg_name); - rcu_read_unlock(); + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { @@ -3401,7 +3367,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *s) { struct kmem_cache *new = NULL; - static char *tmp_name = NULL; + static char *tmp_path = NULL, *tmp_name = NULL; static DEFINE_MUTEX(mutex); /* protects tmp_name */ BUG_ON(!memcg_can_account_kmem(memcg)); @@ -3413,18 +3379,20 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, * This static temporary buffer is used to prevent from * pointless shortliving allocation. */ - if (!tmp_name) { - tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!tmp_path || !tmp_name) { + if (!tmp_path) + tmp_path = kmalloc(PATH_MAX, GFP_KERNEL); if (!tmp_name) + tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmp_path || !tmp_name) goto out; } - rcu_read_lock(); - snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); - rcu_read_unlock(); + cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1); + snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), tmp_name); - new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, + new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor, s); if (new) new->allocflags |= __GFP_KMEMCG; -- cgit v1.3-14-g43fede From 3c9c825b8b50de7dbb015e6bfc04bb2da79364d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: rename cgroupfs_root->number_of_cgroups to ->nr_cgrps and make it atomic_t root->number_of_cgroups is currently an integer protected with cgroup_mutex. Except for sanity checks and proc reporting, the only place it's used is to check whether the root has any child during remount; however, this is a bit flawed as the counter is not decremented when the cgroup is unlinked but when it's released, meaning that there could be an extended period where all cgroups are removed but remount is still not allowed because some internal objects are lingering. While not perfect either, it'd be better to use emptiness test on root->top_cgroup.children. This patch updates cgroup_remount() to test top_cgroup's children instead, which makes number_of_cgroups only actual usage statistics printing in proc implemented in proc_cgroupstats_show(). Let's shorten its name and make it an atomic_t so that we don't have to worry about its synchronization. It's purely auxiliary at this point. 
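The counter change itself is mechanical; the point is that a value which is only incremented, decremented and read for reporting, never consulted in a check-then-act sequence, needs no lock. A user-space sketch of the pattern using C11 atomics in place of the kernel's atomic_t (the helper functions are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int nr_cgrps = 1;        /* the root cgroup */

static void cgroup_created(void)  { atomic_fetch_add(&nr_cgrps, 1); }
static void cgroup_released(void) { atomic_fetch_sub(&nr_cgrps, 1); }

int main(void)
{
    cgroup_created();
    cgroup_created();
    cgroup_released();
    /* purely auxiliary statistic; a slightly stale read is fine */
    printf("nr_cgrps = %d\n", atomic_load(&nr_cgrps));    /* 2 */
    return 0;
}

The remount check, by contrast, is a real decision, which is why it moves to an emptiness test on top_cgroup.children rather than staying on the counter.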
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4d6ff7d40cf6..f0e6105bbbc1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -290,8 +290,8 @@ struct cgroupfs_root { /* The root cgroup for this hierarchy */ struct cgroup top_cgroup; - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ + atomic_t nr_cgrps; /* A list running through the active hierarchies */ struct list_head root_list; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 638df032fb94..cffdb6e2ad08 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -758,7 +758,7 @@ static void cgroup_put_root(struct cgroupfs_root *root) } mutex_lock(&cgroup_mutex); - BUG_ON(root->number_of_cgroups != 1); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ @@ -928,9 +928,7 @@ static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - mutex_lock(&cgroup_mutex); - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); + atomic_dec(&cgrp->root->nr_cgrps); /* * We get a ref to the parent, and put the ref when this cgroup is @@ -1320,7 +1318,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (root->number_of_cgroups > 1) { + if (!list_empty(&root->top_cgroup.children)) { ret = -EBUSY; goto out_unlock; } @@ -1360,7 +1358,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; + atomic_set(&root->nr_cgrps, 1); cgrp->root = root; init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); @@ -1463,7 +1461,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root) write_unlock(&css_set_lock); BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); kernfs_activate(root_cgrp->kn); ret = 0; @@ -3709,7 +3707,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; + atomic_inc(&root->nr_cgrps); /* * Grab a reference on the root and parent so that they don't get @@ -4281,7 +4279,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, - ss->root->number_of_cgroups, !ss->disabled); + atomic_read(&ss->root->nr_cgrps), !ss->disabled); mutex_unlock(&cgroup_mutex); return 0; -- cgit v1.3-14-g43fede From 776f02fa4e1ad70557c0318c70ce928e0642bee0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 09:29:50 -0500 Subject: cgroup: remove cgroupfs_root->refcnt Currently, cgroupfs_root and its ->top_cgroup are separated reference counted and the latter's is ignored. There's no reason to do this separately. This patch removes cgroupfs_root->refcnt and destroys cgroupfs_root when the top_cgroup is released. * cgroup_put() updated to ignore cgroup_is_dead() test for top cgroups. 
cgroup_free_fn() updated to handle root destruction when releasing a top cgroup. * As root destruction is now bounced through cgroup destruction, it is asynchronous. Update cgroup_mount() so that it waits for pending release which is currently implemented using msleep(). Converting this to proper wait_queue isn't hard but likely unnecessary. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 4 +-- kernel/cgroup.c | 86 ++++++++++++++++++++++---------------------------- 2 files changed, 39 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f0e6105bbbc1..298d616e8f40 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -282,12 +282,10 @@ struct cgroupfs_root { /* The bitmask of subsystems attached to this hierarchy */ unsigned long subsys_mask; - atomic_t refcnt; - /* Unique id for this hierarchy. */ int hierarchy_id; - /* The root cgroup for this hierarchy */ + /* The root cgroup. Root is destroyed on its release. */ struct cgroup top_cgroup; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cffdb6e2ad08..03845c5d082b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -53,6 +53,7 @@ #include /* TODO: replace with more sophisticated array */ #include /* used in cgroup_attach_task */ #include +#include #include @@ -728,37 +729,16 @@ static void cgroup_free_root(struct cgroupfs_root *root) } } -static void cgroup_get_root(struct cgroupfs_root *root) -{ - /* - * The caller must ensure that @root is alive, which can be - * achieved by holding a ref on one of the member cgroups or - * following a registered reference to @root while holding - * cgroup_tree_mutex. - */ - WARN_ON_ONCE(atomic_read(&root->refcnt) <= 0); - atomic_inc(&root->refcnt); -} - -static void cgroup_put_root(struct cgroupfs_root *root) +static void cgroup_destroy_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; int ret; - /* - * @root's refcnt reaching zero and its deregistration should be - * atomic w.r.t. cgroup_tree_mutex. This ensures that - * cgroup_get_root() is safe to invoke if @root is registered. - */ mutex_lock(&cgroup_tree_mutex); - if (!atomic_dec_and_test(&root->refcnt)) { - mutex_unlock(&cgroup_tree_mutex); - return; - } mutex_lock(&cgroup_mutex); - BUG_ON(atomic_read(&root->nr_cgrps) != 1); + BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ @@ -929,21 +909,24 @@ static void cgroup_free_fn(struct work_struct *work) struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); atomic_dec(&cgrp->root->nr_cgrps); - - /* - * We get a ref to the parent, and put the ref when this cgroup is - * being freed, so it's guaranteed that the parent won't be - * destroyed before its children. - */ - cgroup_put(cgrp->parent); - - /* put the root reference that we took when we created the cgroup */ - cgroup_put_root(cgrp->root); - cgroup_pidlist_destroy_all(cgrp); - kernfs_put(cgrp->kn); - kfree(cgrp); + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when this + * cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is top cgroup's refcnt reaching zero, which + * indicates that the root should be released. 
+ */ + cgroup_destroy_root(cgrp->root); + } } static void cgroup_free_rcu(struct rcu_head *head) @@ -965,7 +948,7 @@ static void cgroup_put(struct cgroup *cgrp) { if (!atomic_dec_and_test(&cgrp->refcnt)) return; - if (WARN_ON_ONCE(!cgroup_is_dead(cgrp))) + if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) return; /* @@ -1356,7 +1339,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; - atomic_set(&root->refcnt, 1); INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); cgrp->root = root; @@ -1485,7 +1467,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup_sb_opts opts; struct dentry *dentry; int ret; - +retry: mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1531,7 +1513,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, } } - cgroup_get_root(root); + /* + * A root's lifetime is governed by its top cgroup. Zero + * ref indicate that the root is being destroyed. Wait for + * destruction to complete so that the subsystems are free. + * We can use wait_queue for the wait but this path is + * super cold. Let's just sleep for a bit and retry. + */ + if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + msleep(10); + goto retry; + } + + ret = 0; goto out_unlock; } @@ -1558,7 +1554,7 @@ out_unlock: dentry = kernfs_mount(fs_type, flags, root->kf_root); if (IS_ERR(dentry)) - cgroup_put_root(root); + cgroup_put(&root->top_cgroup); return dentry; } @@ -1567,7 +1563,7 @@ static void cgroup_kill_sb(struct super_block *sb) struct kernfs_root *kf_root = kernfs_root_from_sb(sb); struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); - cgroup_put_root(root); + cgroup_put(&root->top_cgroup); kernfs_kill_sb(sb); } @@ -3708,12 +3704,6 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); atomic_inc(&root->nr_cgrps); - - /* - * Grab a reference on the root and parent so that they don't get - * deleted while there are child cgroups. - */ - cgroup_get_root(root); cgroup_get(parent); /* -- cgit v1.3-14-g43fede From 1a11533fbd71792e8c5d36f6763fbce8df0d231d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 19:06:19 -0500 Subject: Revert "cgroup: use an ordered workqueue for cgroup destruction" This reverts commit ab3f5faa6255a0eb4f832675507d9e295ca7e9ba. Explanation from Hugh: It's because more thorough testing, by others here, found that it wasn't always solving the problem: so I asked Tejun privately to hold off from sending it in, until we'd worked out why not. Most of our testing being on a v3,11-based kernel, it was perfectly possible that the problem was merely our own e.g. missing Tejun's 8a2b75384444 ("workqueue: fix ordered workqueues in NUMA setups"). But that turned out not to be enough to fix it either. Then Filipe pointed out how percpu_ref_kill_and_confirm() uses call_rcu_sched() before we ever get to put the offline on to the workqueue: by the time we get to the workqueue, the ordering has already been lost. So, thanks for the Acks, but I'm afraid that this ordered workqueue solution is just not good enough: we should simply forget that patch and provide a different answer." 
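Hugh's observation can be reproduced outside the kernel: a FIFO queue only preserves arrival order, and here the arrival order is already scrambled by the asynchronous grace-period stage in front of the queue. A rough pthread analogue, probabilistic by construction; names and timings are purely illustrative (build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NITEMS 2

static const char *fifo[NITEMS];    /* stands in for the ordered workqueue */
static int head;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

struct rcu_stage { const char *name; useconds_t delay; };

static void *grace_period(void *arg)    /* stands in for call_rcu_sched() */
{
    struct rcu_stage *s = arg;

    usleep(s->delay);                   /* variable grace-period latency */
    pthread_mutex_lock(&lock);
    fifo[head++] = s->name;             /* only now does FIFO order exist */
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    struct rcu_stage parent = { "parent", 200000 };
    struct rcu_stage child  = { "child",   50000 };
    pthread_t a, b;

    pthread_create(&a, NULL, grace_period, &parent);    /* offlined first */
    pthread_create(&b, NULL, grace_period, &child);
    pthread_join(a, NULL);
    pthread_join(b, NULL);

    for (int i = 0; i < NITEMS; i++)    /* likely: child before parent */
        printf("offline %s\n", fifo[i]);
    return 0;
}

By the time the work items reach the queue, the parent/child ordering is already gone, so making the queue ordered cannot restore it.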
Signed-off-by: Tejun Heo Cc: Hugh Dickins --- kernel/cgroup.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 52719ce55dd3..68d87103b493 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4844,16 +4844,12 @@ static int __init cgroup_wq_init(void) /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. - * - * XXX: Must be ordered to make sure parent is offlined after - * children. The ordering requirement is for memcg where a - * parent's offline may wait for a child's leading to deadlock. In - * the long term, this should be fixed from memcg side. + * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); /* -- cgit v1.3-14-g43fede From d3ba07c3aa9ae3e03329b0a7f1a067c0647aa2af Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:38 -0500 Subject: cgroup: disallow xattr, release_agent and name if sane_behavior Disallow more mount options if sane_behavior. Note that xattr used to generate warning. While at it, simplify option check in cgroup_mount() and update sane_behavior comment in cgroup.h. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 +++--- kernel/cgroup.c | 14 ++++---------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 298d616e8f40..5f669ca0ee36 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -227,8 +227,8 @@ enum { * * The followings are the behaviors currently affected this flag. * - * - Mount options "noprefix" and "clone_children" are disallowed. - * Also, cgroupfs file cgroup.clone_children is not created. + * - Mount options "noprefix", "xattr", "clone_children", + * "release_agent" and "name" are disallowed. * * - When mounting an existing superblock, mount options should * match. @@ -246,7 +246,7 @@ enum { * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * - * - "xattr" mount option is deprecated. kernfs always enables it. + * - "cgroup.clone_children" is removed. 
* * - cpuset: tasks will be kept in empty cpusets when hotplug happens * and take masks of ancestors with non-empty cpus/mems, instead of diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 03845c5d082b..079c478a4735 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1226,18 +1226,12 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (opts->flags & CGRP_ROOT_NOPREFIX) { - pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || + opts->cpuset_clone_children || opts->release_agent || + opts->name) { + pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } - - if (opts->cpuset_clone_children) { - pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); - return -EINVAL; - } - - if (opts->flags & CGRP_ROOT_XATTR) - pr_warning("cgroup: sane_behavior: xattr is always available, flag unnecessary\n"); } /* -- cgit v1.3-14-g43fede From 35585573055f37837eb752ee22eb5523682ca742 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:38 -0500 Subject: cgroup: drop CGRP_ROOT_SUBSYS_BOUND Before kernfs conversion, due to the way super_block lookup works, cgroup roots were created and made visible before being fully initialized. This in turn required a special flag to mark that the root hasn't been fully initialized so that the destruction path can tell fully bound ones from half initialized. That flag is CGRP_ROOT_SUBSYS_BOUND and no longer necessary after the kernfs conversion as the lookup and creation of new root are atomic w.r.t. cgroup_mutex. This patch removes the flag and passes the requests subsystem mask to cgroup_setup_root() so that it can set the respective mask bits as subsystems are bound. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 -- kernel/cgroup.c | 28 ++++------------------------ 2 files changed, 4 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5f669ca0ee36..e2ffcdc26cb7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -267,8 +267,6 @@ enum { /* mount options live below bit 16 */ CGRP_ROOT_OPTION_MASK = (1 << 16) - 1, - - CGRP_ROOT_SUBSYS_BOUND = (1 << 16), /* subsystems finished binding */ }; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 079c478a4735..878cd1810ad1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -733,7 +733,6 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; struct cgrp_cset_link *link, *tmp_link; - int ret; mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -742,11 +741,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); - } + WARN_ON(rebind_subsystems(root, 0, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's @@ -1055,13 +1050,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } - /* - * Mark @root has finished binding subsystems. @root->subsys_mask - * now matches the bound subsystems. 
- */ - root->flags |= CGRP_ROOT_SUBSYS_BOUND; kernfs_activate(cgrp->kn); - return 0; } @@ -1353,15 +1342,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) init_cgroup_root(root); - /* - * We need to set @root->subsys_mask now so that @root can be - * matched by cgroup_test_super() before it finishes - * initialization; otherwise, competing mounts with the same - * options may try to bind the same subsystems instead of waiting - * for the first one leading to unexpected mount errors. - * SUBSYS_BOUND will be set once actual binding is complete. - */ - root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); @@ -1372,7 +1352,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static int cgroup_setup_root(struct cgroupfs_root *root) +static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->top_cgroup; @@ -1415,7 +1395,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root) if (ret) goto destroy_root; - ret = rebind_subsystems(root, root->subsys_mask, 0); + ret = rebind_subsystems(root, ss_mask, 0); if (ret) goto destroy_root; @@ -1532,7 +1512,7 @@ retry: goto out_unlock; } - ret = cgroup_setup_root(root); + ret = cgroup_setup_root(root, opts.subsys_mask); if (ret) cgroup_free_root(root); -- cgit v1.3-14-g43fede From 56fde9e01de45bcfabbb444d33e8bdd8388d2da0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:38 -0500 Subject: cgroup: enable task_cg_lists on the first cgroup mount Tasks are not linked on their css_sets until cgroup task iteration is actually used. This is to avoid incurring overhead on the fork and exit paths for systems which have cgroup compiled in but don't use it. This lazy binding also affects the task migration path. It has to be careful so that it doesn't link tasks to css_sets when task_cg_lists linking is not enabled yet. Unfortunately, this conditional linking in the migration path interferes with planned migration updates. This patch moves the lazy binding a bit earlier, to the first cgroup mount. It's a clear indication that cgroup is being used on the system and task_cg_lists linking is highly likely to be enabled soon anyway through "tasks" and "cgroup.procs" files. This allows cgroup_task_migrate() to always link @tsk->cg_list. Note that it may still race with cgroup_post_fork() but who wins that race is inconsequential. While at it, make use_task_css_set_links a bool, add sanity checks in cgroup_enable_task_cg_lists() and css_task_iter_start(), and update the former so that it's guaranteed and assumes to run only once. 
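The resulting cgroup_enable_task_cg_lists() is a test-under-lock do-once: concurrent callers race harmlessly because the flag is rechecked with the lock held. A condensed user-space sketch of that shape, with the task-walking body elided and a pthread mutex standing in for css_set_lock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static bool use_task_css_set_links;
static pthread_mutex_t css_set_lock = PTHREAD_MUTEX_INITIALIZER;

static void enable_task_cg_lists(void)
{
    pthread_mutex_lock(&css_set_lock);
    if (use_task_css_set_links)        /* someone beat us to it */
        goto out_unlock;
    use_task_css_set_links = true;
    /* ... walk every existing task and link it, exactly once ... */
    puts("linked existing tasks");
out_unlock:
    pthread_mutex_unlock(&css_set_lock);
}

int main(void)
{
    enable_task_cg_lists();    /* first mount: does the work */
    enable_task_cg_lists();    /* later mounts: no-op */
    return 0;
}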
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 878cd1810ad1..506f6da67ad1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -173,6 +173,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); +static void cgroup_enable_task_cg_lists(void); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -375,7 +376,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) * fork()/exit() overhead for people who have cgroups compiled into their * kernel but not actually in use. */ -static int use_task_css_set_links __read_mostly; +static bool use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) { @@ -1441,6 +1442,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + + /* + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. + */ + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); retry: mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -1692,10 +1700,8 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &new_cset->tasks); + list_move(&tsk->cg_list, &new_cset->tasks); write_unlock(&css_set_lock); /* @@ -2362,13 +2368,19 @@ int cgroup_task_count(const struct cgroup *cgrp) * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through * each css_set to its tasks until we see the list actually used - in other - * words after the first call to css_task_iter_start(). + * words after the first mount. */ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; + write_lock(&css_set_lock); - use_task_css_set_links = 1; + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + /* * We need tasklist_lock because RCU is not safe against * while_each_thread(). Besides, a forking task that has passed @@ -2379,16 +2391,22 @@ static void cgroup_enable_task_cg_lists(void) read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); + + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); + /* * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. */ - if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) + if (!(p->flags & PF_EXITING)) list_add(&p->cg_list, &task_css_set(p)->tasks); + task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); +out_unlock: write_unlock(&css_set_lock); } @@ -2621,13 +2639,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) __acquires(css_set_lock) { - /* - * The first time anyone tries to iterate across a css, we need to - * enable the list linking each css_set to its tasks, and fix up - * all existing tasks. 
- */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); + /* no one should try to iterate before mounting cgroups */ + WARN_ON_ONCE(!use_task_css_set_links); read_lock(&css_set_lock); -- cgit v1.3-14-g43fede From afeb0f9fd425239aa477c842480f240bfb6325b3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:39 -0500 Subject: cgroup: relocate cgroup_enable_task_cg_lists() Move it above so that prototype isn't necessary. Let's also move the definition of use_task_css_set_links next to it. This is purely cosmetic. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 103 ++++++++++++++++++++++++++------------------------------ 1 file changed, 48 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 506f6da67ad1..2469699408bd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -173,7 +173,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); -static void cgroup_enable_task_cg_lists(void); /** * cgroup_css - obtain a cgroup's css for the specified subsystem @@ -370,14 +369,6 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* - * We don't maintain the lists running through each css_set to its task - * until after the first call to css_task_iter_start(). This reduces the - * fork()/exit() overhead for people who have cgroups compiled into their - * kernel but not actually in use. - */ -static bool use_task_css_set_links __read_mostly; - static void __put_css_set(struct css_set *cset, int taskexit) { struct cgrp_cset_link *link, *tmp_link; @@ -1307,6 +1298,54 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) return ret; } +/* + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first mount. + */ +static bool use_task_css_set_links __read_mostly; + +static void cgroup_enable_task_cg_lists(void) +{ + struct task_struct *p, *g; + + write_lock(&css_set_lock); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); + + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. 
+ */ + if (!(p->flags & PF_EXITING)) + list_add(&p->cg_list, &task_css_set(p)->tasks); + + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +out_unlock: + write_unlock(&css_set_lock); +} + static void init_cgroup_housekeeping(struct cgroup *cgrp) { atomic_set(&cgrp->refcnt, 1); @@ -2364,52 +2403,6 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first mount. - */ -static void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - - write_lock(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - - WARN_ON_ONCE(!list_empty(&p->cg_list) || - task_css_set(p) != &init_css_set); - - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - */ - if (!(p->flags & PF_EXITING)) - list_add(&p->cg_list, &task_css_set(p)->tasks); - - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); -out_unlock: - write_unlock(&css_set_lock); -} - /** * css_next_child - find the next child of a given css * @pos_css: the current position (%NULL to initiate traversal) -- cgit v1.3-14-g43fede From 07bc356ed2950048d33d667e933e1b913c6e6b6d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:39 -0500 Subject: cgroup: implement cgroup_has_tasks() and unexport cgroup_task_count() cgroup_task_count() read-locks css_set_lock and walks all tasks to count them and then returns the result. The only thing all the users want is determining whether the cgroup is empty or not. This patch implements cgroup_has_tasks() which tests whether cgroup->cset_links is empty, replaces all cgroup_task_count() usages and unexports it. Note that the test isn't synchronized. This is the same as before. The test has always been racy. This will help planned css_set locking update. 
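A sketch of the counting-versus-emptiness distinction, assuming a singly linked list standing in for cgrp->cset_links (names are illustrative; the kernel test is !list_empty() on a doubly linked list):

#include <stdbool.h>
#include <stdio.h>

struct link { struct link *next; };

struct cgroup_like {
    struct link *cset_links;    /* NULL when nothing is linked */
};

static int task_count(const struct cgroup_like *cgrp)    /* O(n), needs locking */
{
    int n = 0;

    for (const struct link *l = cgrp->cset_links; l; l = l->next)
        n++;        /* the real code also walks the tasks on each link */
    return n;
}

static bool has_tasks(const struct cgroup_like *cgrp)    /* O(1), racy hint */
{
    return cgrp->cset_links != NULL;
}

int main(void)
{
    struct link l = { NULL };
    struct cgroup_like empty = { NULL }, busy = { &l };

    printf("empty: count=%d has=%d\n", task_count(&empty), has_tasks(&empty));
    printf("busy:  count=%d has=%d\n", task_count(&busy), has_tasks(&busy));
    return 0;
}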
Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- include/linux/cgroup.h | 8 ++++++-- kernel/cgroup.c | 2 +- kernel/cpuset.c | 2 +- mm/memcontrol.c | 4 ++-- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e2ffcdc26cb7..72154fb44fb5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -457,6 +457,12 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; } +/* no synchronization, the result can only be used as a hint */ +static inline bool cgroup_has_tasks(struct cgroup *cgrp) +{ + return !list_empty(&cgrp->cset_links); +} + /* returns ino associated with a cgroup, 0 indicates unmounted root */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { @@ -516,8 +522,6 @@ int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); -int cgroup_task_count(const struct cgroup *cgrp); - /* * Control Group taskset, used to pass around set of tasks to cgroup_subsys * methods. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2469699408bd..ec7746e5ded1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2391,7 +2391,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); * * Return the number of tasks in the cgroup. */ -int cgroup_task_count(const struct cgroup *cgrp) +static int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e97a6e88d036..ae190b0a196a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c1c25494f7ae..d9c6ac1532e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4958,7 +4958,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) struct cgroup *cgrp = memcg->css.cgroup; /* returns EBUSY if there is a task or if we come here twice. */ - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) + if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children)) return -EBUSY; /* we call try-to-free pages for make this cgroup empty */ @@ -5140,7 +5140,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * of course permitted. */ mutex_lock(&memcg_create_mutex); - if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) + if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) err = -EBUSY; mutex_unlock(&memcg_create_mutex); if (err) -- cgit v1.3-14-g43fede From e406d1cfff6ab189c8676072d211809c94fecaf0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:39 -0500 Subject: cgroup: reimplement cgroup_transfer_tasks() without using css_scan_tasks() Reimplement cgroup_transfer_tasks() so that it repeatedly fetches the first task in the cgroup and then transfers it. This achieves the same result without using css_scan_tasks(), which is scheduled to be removed.
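The loop that replaces the scan pins one task at a time so the iterator can be torn down before the potentially blocking attach step; schematically (a sketch of the pattern, matching the hunk below with declarations shown and error handling trimmed):

struct css_task_iter it;
struct task_struct *task;
int ret = 0;

do {
	css_task_iter_start(&from->dummy_css, &it);
	task = css_task_iter_next(&it);		/* fetch the first task */
	if (task)
		get_task_struct(task);		/* pin across iter_end() */
	css_task_iter_end(&it);			/* drop iterator locks early */

	if (task) {
		mutex_lock(&cgroup_mutex);
		ret = cgroup_attach_task(to, task, false);
		mutex_unlock(&cgroup_mutex);
		put_task_struct(task);
	}
} while (task && !ret);			/* stop when drained or on error */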
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ec7746e5ded1..893b7b502e18 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2850,15 +2850,6 @@ int css_scan_tasks(struct cgroup_subsys_state *css, return 0; } -static void cgroup_transfer_one_task(struct task_struct *task, void *data) -{ - struct cgroup *new_cgroup = data; - - mutex_lock(&cgroup_mutex); - cgroup_attach_task(new_cgroup, task, false); - mutex_unlock(&cgroup_mutex); -} - /** * cgroup_trasnsfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside */ @@ -2866,8 +2857,26 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, - to, NULL); + struct css_task_iter it; + struct task_struct *task; + int ret = 0; + + do { + css_task_iter_start(&from->dummy_css, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + mutex_lock(&cgroup_mutex); + ret = cgroup_attach_task(to, task, false); + mutex_unlock(&cgroup_mutex); + put_task_struct(task); + } + } while (task && !ret); + + return ret; } /* -- cgit v1.3-14-g43fede From 96d365e0b86ee7ec6366c99669687e54c9f145e3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cgroup: make css_set_lock a rwsem and rename it to css_set_rwsem Currently there are two ways to walk tasks of a cgroup - css_task_iter_start/next/end() and css_scan_tasks(). The latter builds on the former but allows blocking while iterating. Unfortunately, the way css_scan_tasks() is implemented is rather nasty: it uses a priority heap of pointers to extract some number of tasks in task creation order and loops over them invoking the callback and repeats that until it reaches the end. It requires either a preallocated heap or may fail under memory pressure; while unlikely to be problematic, the complexity is O(N^2), and it is in general just nasty. We're gonna convert all css_scan_tasks() users to css_task_iter_start/next/end() and remove css_scan_tasks(). As css_scan_tasks() users may block, let's convert css_set_lock to a rwsem so that tasks can block while css_task_iter_*() is in progress. While this does increase the chance of possible deadlock scenarios, given the current usage, the probability is relatively low, and even if that happens, the right thing to do is updating the iteration in a similar way to css iterators so that it can handle blocking. Most conversions are trivial; however, task_cgroup_path() now expects to be called with css_set_rwsem locked instead of locking itself. This is because the function is called with RCU read lock held and rwsem locking should nest outside RCU read lock.
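The mechanical part of the conversion maps each rwlock primitive one-to-one onto its rw_semaphore counterpart; a sketch with hypothetical walker functions (the point being that the read side may now sleep):

/* before: spinning rwlock, readers must not block */
static DEFINE_RWLOCK(css_set_lock);

static void walk_css_sets_spinning(void)
{
	read_lock(&css_set_lock);
	/* ... walk css_sets; no sleeping allowed in here ... */
	read_unlock(&css_set_lock);
}

/* after: sleeping rw_semaphore, iterators may block mid-walk */
static DECLARE_RWSEM(css_set_rwsem);

static void walk_css_sets_blocking(void)
{
	down_read(&css_set_rwsem);
	/* ... walk css_sets; GFP_KERNEL allocations etc. are fine ... */
	up_read(&css_set_rwsem);
}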
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 104 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 57 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 893b7b502e18..89428b9d9933 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -341,11 +342,10 @@ static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; /* - * css_set_lock protects the list of css_set objects, and the chain of - * tasks off each css_set. Nests outside task->alloc_lock due to - * css_task_iter_start(). + * css_set_rwsem protects the list of css_set objects, and the chain of + * tasks off each css_set. */ -static DEFINE_RWLOCK(css_set_lock); +static DECLARE_RWSEM(css_set_rwsem); static int css_set_count; /* @@ -380,9 +380,9 @@ static void __put_css_set(struct css_set *cset, int taskexit) */ if (atomic_add_unless(&cset->refcount, -1, 1)) return; - write_lock(&css_set_lock); + down_write(&css_set_rwsem); if (!atomic_dec_and_test(&cset->refcount)) { - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); return; } @@ -396,7 +396,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cset_link); list_del(&link->cgrp_link); - /* @cgrp can't go away while we're holding css_set_lock */ + /* @cgrp can't go away while we're holding css_set_rwsem */ if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); @@ -406,7 +406,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) kfree(link); } - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); kfree_rcu(cset, rcu_head); } @@ -627,11 +627,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (cset) return cset; @@ -655,7 +655,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - write_lock(&css_set_lock); + down_write(&css_set_rwsem); /* Add reference counts and links from the new css_set. */ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -673,7 +673,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); return cset; } @@ -739,14 +739,14 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) * Release all the links from cset_links to this hierarchy's * root cgroup */ - write_lock(&css_set_lock); + down_write(&css_set_rwsem); list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { list_del(&link->cset_link); list_del(&link->cgrp_link); kfree(link); } - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); if (!list_empty(&root->root_list)) { list_del(&root->root_list); @@ -764,7 +764,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex held. + * called with cgroup_mutex and css_set_rwsem held. 
*/ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroupfs_root *root) @@ -772,8 +772,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct css_set *cset; struct cgroup *res = NULL; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - read_lock(&css_set_lock); + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * No need to lock the task - since we hold cgroup_mutex the * task can't change groups, so the only thing that can happen @@ -794,7 +795,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } } } - read_unlock(&css_set_lock); + BUG_ON(!res); return res; } @@ -1310,7 +1311,7 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - write_lock(&css_set_lock); + down_write(&css_set_rwsem); if (use_task_css_set_links) goto out_unlock; @@ -1343,7 +1344,7 @@ static void cgroup_enable_task_cg_lists(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); } static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -1408,7 +1409,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) root_cgrp->id = ret; /* - * We're accessing css_set_count without locking css_set_lock here, + * We're accessing css_set_count without locking css_set_rwsem here, * but that's OK - it can only be increased by someone holding * cgroup_lock, and that's us. The worst that can happen is that we * have some link structures left over @@ -1451,10 +1452,10 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) * Link the top cgroup in this hierarchy into all the css_set * objects. */ - write_lock(&css_set_lock); + down_write(&css_set_rwsem); hash_for_each(css_set_table, i, cset, hlist) link_css_set(&tmp_links, cset, root_cgrp); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -1617,6 +1618,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) char *path = NULL; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -1629,6 +1631,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) path = buf; } + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); return path; } @@ -1739,9 +1742,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - write_lock(&css_set_lock); + down_write(&css_set_rwsem); list_move(&tsk->cg_list, &new_cset->tasks); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); /* * We just gained a reference on old_cset by taking it from the @@ -1799,6 +1802,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. */ + down_read(&css_set_rwsem); rcu_read_lock(); do { struct task_and_cgroup ent; @@ -1826,6 +1830,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, break; } while_each_thread(leader, tsk); rcu_read_unlock(); + up_read(&css_set_rwsem); /* remember the number of threads in the array for later. 
*/ group_size = i; tset.tc_array = group; @@ -2003,7 +2008,11 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cgrp = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp; + + down_read(&css_set_rwsem); + from_cgrp = task_cgroup_from_root(from, root); + up_read(&css_set_rwsem); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2396,10 +2405,10 @@ static int cgroup_task_count(const struct cgroup *cgrp) int count = 0; struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return count; } @@ -2630,12 +2639,12 @@ static void css_advance_task_iter(struct css_task_iter *it) */ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) - __acquires(css_set_lock) + __acquires(css_set_rwsem) { /* no one should try to iterate before mounting cgroups */ WARN_ON_ONCE(!use_task_css_set_links); - read_lock(&css_set_lock); + down_read(&css_set_rwsem); it->origin_css = css; it->cset_link = &css->cgroup->cset_links; @@ -2683,9 +2692,9 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) - __releases(css_set_lock) + __releases(css_set_rwsem) { - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); } static inline int started_after_time(struct task_struct *t1, @@ -2735,7 +2744,7 @@ static inline int started_after(void *p1, void *p2) * * @test may be NULL, meaning always true (select all tasks), which * effectively duplicates css_task_iter_{start,next,end}() but does not - * lock css_set_lock for the call to @process. + * lock css_set_rwsem for the call to @process. * * It is guaranteed that @process will act on every task that is a member * of @css for the duration of this call. This function may or may not @@ -3867,12 +3876,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); /* - * css_set_lock synchronizes access to ->cset_links and prevents + * css_set_rwsem synchronizes access to ->cset_links and prevents * @cgrp from being removed while __put_css_set() is in progress. */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); empty = list_empty(&cgrp->cset_links); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (!empty) return -EBUSY; @@ -4208,6 +4217,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) retval = 0; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); for_each_active_root(root) { struct cgroup_subsys *ss; @@ -4233,6 +4243,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) } out_unlock: + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); put_task_struct(tsk); out_free: @@ -4328,12 +4339,12 @@ void cgroup_post_fork(struct task_struct *child) * lock on fork. */ if (use_task_css_set_links) { - write_lock(&css_set_lock); + down_write(&css_set_rwsem); task_lock(child); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &task_css_set(child)->tasks); task_unlock(child); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); } /* @@ -4390,15 +4401,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) int i; /* - * Unlink from the css_set task list if necessary. 
- * Optimistically check cg_list before taking - css_set_lock + * Unlink from the css_set task list if necessary. Optimistically + * check cg_list before taking css_set_rwsem. */ if (!list_empty(&tsk->cg_list)) { - write_lock(&css_set_lock); + down_write(&css_set_rwsem); if (!list_empty(&tsk->cg_list)) list_del_init(&tsk->cg_list); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); } /* Reassign the task to the init_css_set. */ @@ -4650,7 +4660,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) if (!name_buf) return -ENOMEM; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -4666,7 +4676,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) c->root->hierarchy_id, name); } rcu_read_unlock(); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); kfree(name_buf); return 0; } @@ -4677,7 +4687,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -4693,7 +4703,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) } } } - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return 0; } -- cgit v1.3-14-g43fede From d66393e54e0a9dc743e440eb36c58bd1158a560e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cpuset: use css_task_iter_start/next/end() instead of css_scan_tasks() Now that css_task_iter_start/next/end() supports blocking while iterating, there's no reason to use css_scan_tasks() which is more cumbersome to use and scheduled to be removed. Convert all css_scan_tasks() usages in cpuset to css_task_iter_start/next/end(). This simplifies the code by removing heap allocation and callbacks. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 186 ++++++++++++++++++-------------------------------------- 1 file changed, 58 insertions(+), 128 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ae190b0a196a..65ae0bdf4af8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -828,56 +828,37 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) return cs; } -/** - * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's - * @tsk: task to test - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed - * mask needs to be changed. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_cpumask(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - - set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); -} - /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() - * - * Called with cpuset_mutex held - * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL.
+ * Iterate through each task of @cs updating its cpus_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. */ -static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_cpumask(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + css_task_iter_end(&it); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, continue; rcu_read_unlock(); - update_tasks_cpumask(cp, heap); + update_tasks_cpumask(cp); rcu_read_lock(); css_put(&cp->css); @@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - struct ptr_heap heap; int retval; int is_load_balanced; @@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; - is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - update_tasks_cpumask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_cpumask_hier(cs, true); if (is_load_balanced) rebuild_sched_domains_locked(); @@ -1052,53 +1026,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } -struct cpuset_change_nodemask_arg { - struct cpuset *cs; - nodemask_t *newmems; -}; - -/* - * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy - * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cpuset_mutex held. - */ -static void cpuset_change_nodemask(struct task_struct *p, void *data) -{ - struct cpuset_change_nodemask_arg *arg = data; - struct cpuset *cs = arg->cs; - struct mm_struct *mm; - int migrate; - - cpuset_change_task_nodemask(p, arg->newmems); - - mm = get_task_mm(p); - if (!mm) - return; - - migrate = is_memory_migrate(cs); - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); - mmput(mm); -} - static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held. No return value. It's guaranteed that - * css_scan_tasks() always returns 0 if @heap != NULL. 
+ * Iterate through each task of @cs updating its mems_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct cpuset *mems_cs = effective_nodemask_cpuset(cs); - struct cpuset_change_nodemask_arg arg = { .cs = cs, - .newmems = &newmems }; + struct css_task_iter it; + struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ @@ -1114,7 +1057,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) { + struct mm_struct *mm; + bool migrate; + + cpuset_change_task_nodemask(task, &newmems); + + mm = get_task_mm(task); + if (!mm) + continue; + + migrate = is_memory_migrate(cs); + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) + cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); + mmput(mm); + } + css_task_iter_end(&it); /* * All the tasks' nodemasks have been updated, update @@ -1130,15 +1091,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1159,7 +1118,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, continue; rcu_read_unlock(); - update_tasks_nodemask(cp, heap); + update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -1184,7 +1143,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - struct ptr_heap heap; /* * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; @@ -1223,17 +1181,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval < 0) - goto done; - mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_nodemask_hier(cs, true); done: return retval; } @@ -1260,39 +1212,23 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) return 0; } -/** - * cpuset_change_flag - make a task's spread flags the same as its cpuset's - * @tsk: task to be updated - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. 
- */ -static void cpuset_change_flag(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - - cpuset_update_task_spread_flag(cs, tsk); -} - /** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() - * - * Called with cpuset_mutex held * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. */ -static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_flags(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + cpuset_update_task_spread_flag(cs, task); + css_task_iter_end(&it); } /* @@ -1310,7 +1246,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, struct cpuset *trialcs; int balance_flag_changed; int spread_flag_changed; - struct ptr_heap heap; int err; trialcs = alloc_trial_cpuset(cs); @@ -1326,10 +1261,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) goto out; - err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (err < 0) - goto out; - balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); @@ -1344,8 +1275,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, rebuild_sched_domains_locked(); if (spread_flag_changed) - update_tasks_flags(cs, &heap); - heap_free(&heap); + update_tasks_flags(cs); out: free_trial_cpuset(trialcs); return err; @@ -2138,7 +2068,7 @@ retry: */ if ((sane && cpumask_empty(cs->cpus_allowed)) || (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs, NULL); + update_tasks_cpumask(cs); mutex_lock(&callback_mutex); nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); @@ -2152,7 +2082,7 @@ retry: */ if ((sane && nodes_empty(cs->mems_allowed)) || (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs, NULL); + update_tasks_nodemask(cs); is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); @@ -2214,7 +2144,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) mutex_lock(&callback_mutex); top_cpuset.mems_allowed = new_mems; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, NULL); + update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); -- cgit v1.3-14-g43fede From 889ed9ceaa97bb02bf5d7349e24639f7fc5f4fa0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cgroup: remove css_scan_tasks() css_scan_tasks() doesn't have any user left. Remove it. 
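What remains for former css_scan_tasks() users is the plain iterator idiom; a sketch, where process_one() stands in for whatever per-task work the old @process callback did, and the caller's own mutex (e.g. cpuset_mutex) keeps cgroup membership stable:

struct css_task_iter it;
struct task_struct *task;

css_task_iter_start(css, &it);
while ((task = css_task_iter_next(&it)))
	process_one(task);	/* hypothetical per-task work */
css_task_iter_end(&it);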
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 -- kernel/cgroup.c | 162 ------------------------------------------------- 2 files changed, 168 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 72154fb44fb5..3bd0a7138371 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -813,11 +812,6 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap); - int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 89428b9d9933..05c0c23549f9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2697,168 +2697,6 @@ void css_task_iter_end(struct css_task_iter *it) up_read(&css_set_rwsem); } -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) simultaneously. - */ - return t1 > t2; - } -} - -/* - * This function is a callback from heap_insert() and is used to order - * the heap. - * In this case we order the heap in descending task start time. - */ -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); -} - -/** - * css_scan_tasks - iterate though all the tasks in a css - * @css: the css to iterate tasks of - * @test: optional test callback - * @process: process callback - * @data: data passed to @test and @process - * @heap: optional pre-allocated heap used for task iteration - * - * Iterate through all the tasks in @css, calling @test for each, and if it - * returns %true, call @process for it also. - * - * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates css_task_iter_{start,next,end}() but does not - * lock css_set_rwsem for the call to @process. - * - * It is guaranteed that @process will act on every task that is a member - * of @css for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different css during the - * call, or are forked or move into the css during the call. - * - * Note that @test may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should be - * cheap. - * - * If @heap is non-NULL, a heap has been pre-allocated and will be used for - * heap operations (and its "gt" member will be overwritten), else a - * temporary heap will be used (allocation of which may cause this function - * to fail). 
- */ -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) -{ - int retval, i; - struct css_task_iter it; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; - struct ptr_heap tmp_heap; - struct timespec latest_time = { 0, 0 }; - - if (heap) { - /* The caller supplied our heap and pre-allocated its memory */ - heap->gt = &started_after; - } else { - /* We need to allocate our own heap memory */ - heap = &tmp_heap; - retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - /* cannot allocate the heap */ - return retval; - } - - again: - /* - * Scan tasks in the css, using the @test callback to determine - * which are of interest, and invoking @process callback on the - * ones which need an update. Since we don't want to hold any - * locks during the task updates, gather tasks to be processed in a - * heap structure. The heap is sorted by descending task start - * time. If the statically-sized heap fills up, we overflow tasks - * that started later, and in future iterations only consider tasks - * that started after the latest task in the previous pass. This - * guarantees forward progress and that we don't miss any tasks. - */ - heap->size = 0; - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { - /* - * Only affect tasks that qualify per the caller's callback, - * if he provided one - */ - if (test && !test(p, data)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(heap, p); - if (dropped == NULL) { - /* - * The new task was inserted; the heap wasn't - * previously full - */ - get_task_struct(p); - } else if (dropped != p) { - /* - * The new task was inserted, and pushed out a - * different task - */ - get_task_struct(p); - put_task_struct(dropped); - } - /* - * Else the new task was newer than anything already in - * the heap and wasn't inserted - */ - } - css_task_iter_end(&it); - - if (heap->size) { - for (i = 0; i < heap->size; i++) { - struct task_struct *q = heap->ptrs[i]; - if (i == 0) { - latest_time = q->start_time; - latest_task = q; - } - /* Process the task per the caller's callback */ - process(q, data); - put_task_struct(q); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't get processed. - * Not the most efficient way to do it, but it avoids - * having to take callback_mutex in the fork path - */ - goto again; - } - if (heap == &tmp_heap) - heap_free(&tmp_heap); - return 0; -} - /** * cgroup_trasnsfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved -- cgit v1.3-14-g43fede From 89c5509b0d71d1609761bf72d33333ab206dac9f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:40 -0500 Subject: cgroup: separate out put_css_set_locked() and remove put_css_set_taskexit() put_css_set() is performed in two steps - it first tries to put without grabbing css_set_rwsem if such put wouldn't make the count zero. If that fails, it puts after write-locking css_set_rwsem. This patch separates out the second phase into put_css_set_locked() which should be called with css_set_rwsem locked. 
Also, put_css_set_taskexit() is dropped and put_css_set() is made to take @taskexit. There are only a handful of users of these functions. No point in providing different variants. put_css_set_locked() will be used by later changes. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 05c0c23549f9..17b10b8efbcf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -369,22 +369,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -static void __put_css_set(struct css_set *cset, int taskexit) +static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - down_write(&css_set_rwsem); - if (!atomic_dec_and_test(&cset->refcount)) { - up_write(&css_set_rwsem); + lockdep_assert_held(&css_set_rwsem); + + if (!atomic_dec_and_test(&cset->refcount)) return; - } /* This css_set is dead. unlink it and release cgroup refcounts */ hash_del(&cset->hlist); @@ -406,10 +398,24 @@ static void __put_css_set(struct css_set *cset, int taskexit) kfree(link); } - up_write(&css_set_rwsem); kfree_rcu(cset, rcu_head); } +static void put_css_set(struct css_set *cset, bool taskexit) +{ + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + down_write(&css_set_rwsem); + put_css_set_locked(cset, taskexit); + up_write(&css_set_rwsem); +} + /* * refcounted get/put for css_set objects */ @@ -418,16 +424,6 @@ static inline void get_css_set(struct css_set *cset) { atomic_inc(&cset->refcount); } -static inline void put_css_set(struct css_set *cset) -{ - __put_css_set(cset, 0); -} - -static inline void put_css_set_taskexit(struct css_set *cset) -{ - __put_css_set(cset, 1); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -1752,7 +1748,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * we're safe to drop it here; it will be freed under RCU. */ set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set(old_cset); + put_css_set(old_cset, false); } /** @@ -1898,7 +1894,7 @@ out_put_css_set_refs: tc = flex_array_get(group, i); if (!tc->cset) break; - put_css_set(tc->cset); + put_css_set(tc->cset, false); } } out_cancel_attach: @@ -3715,7 +3711,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * css_set_rwsem synchronizes access to ->cset_links and prevents - * @cgrp from being removed while __put_css_set() is in progress. + * @cgrp from being removed while put_css_set() is in progress.
*/ down_read(&css_set_rwsem); empty = list_empty(&cgrp->cset_links); @@ -4267,7 +4263,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - put_css_set_taskexit(cset); + put_css_set(cset, true); } static void check_for_release(struct cgroup *cgrp) -- cgit v1.3-14-g43fede From cb0f1fe9ba47c202a98a9d41ad5c12c0ac7732e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:41 -0500 Subject: cgroup: move css_set_rwsem locking outside of cgroup_task_migrate() Instead of repeatedly locking and unlocking css_set_rwsem inside cgroup_task_migrate(), update cgroup_attach_task() to grab it outside of the loop and update cgroup_task_migrate() to use put_css_set_locked(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 17b10b8efbcf..704c590a81d7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1715,10 +1715,13 @@ int cgroup_taskset_size(struct cgroup_taskset *tset) EXPORT_SYMBOL_GPL(cgroup_taskset_size); -/* +/** * cgroup_task_migrate - move a task from one cgroup to another. + * @old_cgrp; the cgroup @tsk is being migrated from + * @tsk: the task being migrated + * @new_cset: the new css_set @tsk is being attached to * - * Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. */ static void cgroup_task_migrate(struct cgroup *old_cgrp, struct task_struct *tsk, @@ -1726,6 +1729,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, { struct css_set *old_cset; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * We are synchronized through threadgroup_lock() against PF_EXITING * setting such that we can't race against cgroup_exit() changing the @@ -1738,9 +1744,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - down_write(&css_set_rwsem); list_move(&tsk->cg_list, &new_cset->tasks); - up_write(&css_set_rwsem); /* * We just gained a reference on old_cset by taking it from the @@ -1748,7 +1752,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * we're safe to drop it here; it will be freed under RCU. */ set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set(old_cset, false); + put_css_set_locked(old_cset, false); } /** @@ -1871,10 +1875,12 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * proceed to move all tasks to the new cgroup. There are no * failure cases after here, so this is the commit point. */ + down_write(&css_set_rwsem); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } + up_write(&css_set_rwsem); /* nothing is sensitive to fork() after this point. */ /* -- cgit v1.3-14-g43fede From 924f0d9a2078f49ff331bb43196ec5afadc16b8f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:41 -0500 Subject: cgroup: drop @skip_css from cgroup_taskset_for_each() If !NULL, @skip_css makes cgroup_taskset_for_each() skip the matching css. The intention of the interface is to make it easy to skip css's (cgroup_subsys_states) which already match the migration target; however, this is entirely unnecessary as migration taskset doesn't include tasks which are already in the target cgroup. Drop @skip_css from cgroup_taskset_for_each(). 
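With the skip test gone, the cursor macro reduces to a plain first/next loop, as the include/linux/cgroup.h hunk below shows:

#define cgroup_taskset_for_each(task, tset)				\
	for ((task) = cgroup_taskset_first((tset)); (task);		\
	     (task) = cgroup_taskset_next((tset)))

and every call site simply drops the now-redundant second argument.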
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann --- block/blk-cgroup.c | 2 +- include/linux/cgroup.h | 8 ++------ kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 4 ++-- kernel/events/core.c | 2 +- kernel/sched/core.c | 4 ++-- net/core/netclassid_cgroup.c | 2 +- net/core/netprio_cgroup.c | 2 +- 8 files changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1cef07cf9c21..4aefd46d7d95 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -894,7 +894,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3bd0a7138371..581a124c7bc8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -535,15 +535,11 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor - * @skip_css: skip if task's css matches this, %NULL to iterate through all * @tset: taskset to iterate */ -#define cgroup_taskset_for_each(task, skip_css, tset) \ +#define cgroup_taskset_for_each(task, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) \ - if (!(skip_css) || \ - cgroup_taskset_cur_css((tset), \ - (skip_css)->ss->id) != (skip_css)) + (task) = cgroup_taskset_next((tset))) /* * Control Group subsystem type. diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 98ea26a99076..7201a637c405 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -187,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_css, tset) { + cgroup_taskset_for_each(task, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 65ae0bdf4af8..bf20e4ac2f75 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1398,7 +1398,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1467,7 +1467,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here diff --git a/kernel/events/core.c b/kernel/events/core.c index a3c3ab50271a..6dd714955b04 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8021,7 +8021,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d4cfc5561830..ba386a06ab11 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7600,7 +7600,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7618,7 +7618,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) sched_move_task(task); } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index b865662fba71..22931e1b99b4 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css, void *v = (void *)(unsigned long)cs->classid; struct task_struct *p; - cgroup_taskset_for_each(p, css, tset) { + cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_classid, v); task_unlock(p); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index d7d23e28fafd..f9f3a40d3350 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css, struct task_struct *p; void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, css, tset) { + cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); -- cgit v1.3-14-g43fede From 57fce0a68e3aa71d223d9023aae66c7393970c34 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:41 -0500 Subject: cpuset: don't use cgroup_taskset_cur_css() cgroup_taskset_cur_css() will be removed during the planned restructuring of the migration path. The only use of cgroup_taskset_cur_css() is finding out the old cgroup_subsys_state of the leader in cpuset_attach(). This usage can easily be removed by remembering the old value from cpuset_can_attach().
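The handoff is just a file-scope static written in ->can_attach() and read in ->attach(); a sketch of the pattern (bodies elided; this relies on cgroup migrations being serialized, so only one attach sequence is in flight at a time):

static struct cpuset *cpuset_attach_old_cs;

static int cpuset_can_attach(struct cgroup_subsys_state *css,
			     struct cgroup_taskset *tset)
{
	/* remember the leader's old cpuset for cpuset_attach() */
	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
	/* ... existing permission and emptiness checks ... */
	return 0;
}

static void cpuset_attach(struct cgroup_subsys_state *css,
			  struct cgroup_taskset *tset)
{
	struct cpuset *oldcs = cpuset_attach_old_cs;
	/* ... rebind cpumasks/nodemasks relative to @oldcs ... */
}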
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bf20e4ac2f75..d8bec21d7a11 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1379,6 +1379,8 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +static struct cpuset *cpuset_attach_old_cs; + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) @@ -1387,6 +1389,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct task_struct *task; int ret; + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + mutex_lock(&cpuset_mutex); /* @@ -1450,10 +1455,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, - cpuset_cgrp_id); struct cpuset *cs = css_cs(css); - struct cpuset *oldcs = css_cs(oldcss); + struct cpuset *oldcs = cpuset_attach_old_cs; struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); -- cgit v1.3-14-g43fede From bc668c7519ff8b4681af80e92f463bec7bf7cf9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:43 -0500 Subject: cgroup: remove cgroup_taskset_cur_css() and cgroup_taskset_size() The two functions don't have any users left. Remove them along with cgroup_taskset->cur_cgrp. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 --- kernel/cgroup.c | 30 ------------------------------ 2 files changed, 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 581a124c7bc8..ef0b3af0e61c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -528,9 +528,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); struct cgroup_taskset; struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id); -int cgroup_taskset_size(struct cgroup_taskset *tset); /** * cgroup_taskset_for_each - iterate cgroup_taskset diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 704c590a81d7..a9d9bbb12310 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1647,7 +1647,6 @@ struct cgroup_taskset { struct flex_array *tc_array; int tc_array_len; int idx; - struct cgroup *cur_cgrp; }; /** @@ -1662,7 +1661,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) tset->idx = 0; return cgroup_taskset_next(tset); } else { - tset->cur_cgrp = tset->single.cgrp; return tset->single.task; } } @@ -1683,38 +1681,10 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) return NULL; tc = flex_array_get(tset->tc_array, tset->idx++); - tset->cur_cgrp = tc->cgrp; return tc->task; } EXPORT_SYMBOL_GPL(cgroup_taskset_next); -/** - * cgroup_taskset_cur_css - return the matching css for the current task - * @tset: taskset of interest - * @subsys_id: the ID of the target subsystem - * - * Return the css for the current (last returned) task of @tset for - * subsystem specified by @subsys_id. This function must be preceded by - * either cgroup_taskset_first() or cgroup_taskset_next(). 
- */ -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id) -{ - return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); -} -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); - -/** - * cgroup_taskset_size - return the number of tasks in taskset - * @tset: taskset of interest - */ -int cgroup_taskset_size(struct cgroup_taskset *tset) -{ - return tset->tc_array ? tset->tc_array_len : 1; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_size); - - /** * cgroup_task_migrate - move a task from one cgroup to another. * @old_cgrp; the cgroup @tsk is being migrated from -- cgit v1.3-14-g43fede From 9db8de3722d184b8a431afd6bef803d6867ac889 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:43 -0500 Subject: cgroup: cosmetic updates to cgroup_attach_task() cgroup_attach_task() is planned to go through restructuring. Let's tidy it up a bit in preparation. * Update cgroup_attach_task() to receive the target task argument in @leader instead of @tsk. * Rename @tsk to @task. * Rename @retval to @ret. This is purely cosmetic. v2: get_nr_threads() was using uninitialized @task instead of @leader. Fixed. Reported by Dan Carpenter. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Dan Carpenter --- kernel/cgroup.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a9d9bbb12310..9a890a2e58fc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1728,20 +1728,20 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @tsk: the task or the leader of the threadgroup to be attached + * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, bool threadgroup) { - int retval, i, group_size; + int ret, i, group_size; struct cgroupfs_root *root = cgrp->root; struct cgroup_subsys_state *css, *failed_css = NULL; /* threadgroup list cursor and array */ - struct task_struct *leader = tsk; + struct task_struct *task; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -1754,7 +1754,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * threads exit, this will just be an over-estimate. */ if (threadgroup) - group_size = get_nr_threads(tsk); + group_size = get_nr_threads(leader); else group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ @@ -1762,8 +1762,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. 
*/ - retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (retval) + ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); + if (ret) goto out_free_group_list; i = 0; @@ -1774,17 +1774,18 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ down_read(&css_set_rwsem); rcu_read_lock(); + task = leader; do { struct task_and_cgroup ent; - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - ent.task = tsk; - ent.cgrp = task_cgroup_from_root(tsk, root); + ent.task = task; + ent.cgrp = task_cgroup_from_root(task, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) goto next; @@ -1792,13 +1793,13 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. */ - retval = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(retval != 0); + ret = flex_array_put(group, i, &ent, GFP_ATOMIC); + BUG_ON(ret != 0); i++; next: if (!threadgroup) break; - } while_each_thread(leader, tsk); + } while_each_thread(leader, task); rcu_read_unlock(); up_read(&css_set_rwsem); /* remember the number of threads in the array for later. */ @@ -1807,7 +1808,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, tset.tc_array_len = group_size; /* methods shouldn't be called if no task is actually migrating */ - retval = 0; + ret = 0; if (!group_size) goto out_free_group_list; @@ -1816,8 +1817,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for_each_css(css, i, cgrp) { if (css->ss->can_attach) { - retval = css->ss->can_attach(css, &tset); - if (retval) { + ret = css->ss->can_attach(css, &tset); + if (ret) { failed_css = css; goto out_cancel_attach; } @@ -1835,7 +1836,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, old_cset = task_css_set(tc->task); tc->cset = find_css_set(old_cset, cgrp); if (!tc->cset) { - retval = -ENOMEM; + ret = -ENOMEM; goto out_put_css_set_refs; } } @@ -1863,9 +1864,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 5: success! and cleanup */ - retval = 0; + ret = 0; out_put_css_set_refs: - if (retval) { + if (ret) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); if (!tc->cset) @@ -1874,7 +1875,7 @@ out_put_css_set_refs: } } out_cancel_attach: - if (retval) { + if (ret) { for_each_css(css, i, cgrp) { if (css == failed_css) break; @@ -1884,7 +1885,7 @@ out_cancel_attach: } out_free_group_list: flex_array_free(group); - return retval; + return ret; } /* -- cgit v1.3-14-g43fede From 8541fecc04a91842f023cbfe2c376d4de3b5047e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 06:58:43 -0500 Subject: cgroup: unexport functions With module support gone, a lot of functions no longer need to be exported. Unexport them. 
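EXPORT_SYMBOL_GPL() only matters to loadable modules: it places the symbol in the kernel's export table so GPL modules can link against it. With controllers now built-in only, each hunk is a one-line deletion; a generic sketch with a hypothetical helper:

/* before: callable from GPL modules */
int my_helper(void)
{
	return 0;
}
EXPORT_SYMBOL_GPL(my_helper);

/* after: visible to built-in callers only; the export line is removed,
 * the function body and any header declaration stay unchanged */
int my_helper(void)
{
	return 0;
}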
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9a890a2e58fc..750d0e1e7e56 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -242,7 +242,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) } return false; } -EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -1664,7 +1663,6 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) return tset->single.task; } } -EXPORT_SYMBOL_GPL(cgroup_taskset_first); /** * cgroup_taskset_next - iterate to the next task in taskset @@ -1683,7 +1681,6 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) tc = flex_array_get(tset->tc_array, tset->idx++); return tc->task; } -EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** * cgroup_task_migrate - move a task from one cgroup to another. @@ -2365,7 +2362,6 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) mutex_unlock(&cgroup_tree_mutex); return ret; } -EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_task_count - count the number of tasks in a cgroup. @@ -2439,7 +2435,6 @@ css_next_child(struct cgroup_subsys_state *pos_css, return cgroup_css(next, parent_css->ss); } -EXPORT_SYMBOL_GPL(css_next_child); /** * css_next_descendant_pre - find the next descendant for pre-order walk @@ -2482,7 +2477,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } -EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css @@ -2514,7 +2508,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) return last; } -EXPORT_SYMBOL_GPL(css_rightmost_descendant); static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) @@ -2568,7 +2561,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* no sibling left, visit parent */ return css_parent(pos); } -EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * css_advance_task_iter - advance a task itererator to the next css_set -- cgit v1.3-14-g43fede From dd5fd9b91a77b4c9c28b7ef9c181b1a875820d0a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 11 Feb 2014 14:35:40 +0100 Subject: tick: Clear broadcast pending bit when switching to oneshot AMD systems which use the C1E workaround in the amd_e400_idle routine trigger the WARN_ON_ONCE in the broadcast code when onlining a CPU. The reason is that the idle routine of those AMD systems switches the cpu into forced broadcast mode early on before the newly brought up CPU can switch over to high resolution / NOHZ mode. The timer related CPU1 bringup looks like this: clockevent_register_device(local_apic); tick_setup(local_apic); ... idle() tick_broadcast_on_off(FORCE); tick_broadcast_oneshot_control(ENTER) cpumask_set(cpu, broadcast_oneshot_mask); halt(); Now the broadcast interrupt on CPU0 sets CPU1 in the broadcast_pending_mask and wakes CPU1. So CPU1 continues: local_apic_timer_interrupt() tick_handle_periodic(); softirq() tick_init_highres(); cpumask_clr(cpu, broadcast_oneshot_mask); tick_broadcast_oneshot_control(ENTER) WARN_ON(cpumask_test(cpu, broadcast_pending_mask); So while we remove CPU1 from the broadcast_oneshot_mask when we switch over to highres mode, we do not clear the pending bit, which then triggers the warning when we go back to idle. 
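To restate the invariant with a deliberately simplified model (an editorial sketch, not the kernel code): the broadcast machinery keeps two per-CPU masks that must be cleared together, and the pre-fix highres switch cleared only one of them.

    #include <linux/bitops.h>
    #include <linux/threads.h>

    /* Toy model; the real cpumasks live in kernel/time/tick-broadcast.c. */
    static DECLARE_BITMAP(oneshot_mask, NR_CPUS);
    static DECLARE_BITMAP(pending_mask, NR_CPUS);

    static void switch_to_highres(int cpu)
    {
            clear_bit(cpu, oneshot_mask);
            /*
             * Pre-fix bug: pending_mask can keep a stale bit here, which
             * trips the WARN_ON_ONCE the next time this CPU enters
             * broadcast mode from idle.
             */
    }
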
The reason why this is only visible on C1E affected AMD systems is that the other machines enter the deep sleep states via acpi_idle/intel_idle and exit the broadcast mode before executing the remote triggered local_apic_timer_interrupt. So the pending bit is already cleared when the switch over to highres mode is clearing the oneshot mask. The solution is simple: Clear the pending bit together with the mask bit when we switch over to highres mode. Stanislaw came up independently with the same patch by enforcing the C1E workaround and debugging the fallout. I picked mine, because mine has a changelog :) Reported-by: poma Debugged-by: Stanislaw Gruszka Signed-off-by: Thomas Gleixner Cc: Olaf Hering Cc: Dave Jones Cc: Justin M. Forbes Cc: Josh Boyer Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1402111434180.21991@ionos.tec.linutronix.de Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 43780ab5e279..98977a57ac72 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -756,6 +756,7 @@ out: static void tick_broadcast_clear_oneshot(int cpu) { cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); } static void tick_broadcast_init_next_event(struct cpumask *mask, -- cgit v1.3-14-g43fede From 430af8ad9dad82d775d688155e1db1da385d3e7a Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 13 Feb 2014 16:42:43 -0500 Subject: cgroup: fix coccinelle warnings kernel/cgroup.c:2256:1-3: WARNING: PTR_RET can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: coccinelle/api/ptr_ret.cocci Signed-off-by: Fengguang Wu Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 750d0e1e7e56..15dcae74b510 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2171,9 +2171,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, cft->kf_ops, cft, NULL, false, key); - if (IS_ERR(kn)) - return PTR_ERR(kn); - return 0; + return PTR_ERR_OR_ZERO(kn); } /** -- cgit v1.3-14-g43fede From 40747ffa5aa8d5b99ca46c696234b9194b59e0ac Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:51:59 +0100 Subject: asmlinkage: Make jiffies visible Jiffies is referenced by the linker script, so it has to be visible. Handled both the generic and the x86 version. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-3-git-send-email-ak@linux.intel.com Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/time.c | 2 +- kernel/timer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 24d3c91e9812..6ec91c00d84d 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@ #include #ifdef CONFIG_X86_64 -DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; +__visible DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e5..d78de047599b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -52,7 +52,7 @@ #define CREATE_TRACE_POINTS #include -u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; +__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); -- cgit v1.3-14-g43fede From 63f9a7fde715352e0769302527670542a664b981 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:01 +0100 Subject: asmlinkage: Make lockdep_sys_exit asmlinkage lockdep_sys_exit can be called from assembler code, so make it asmlinkage. Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-5-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- include/linux/lockdep.h | 2 +- kernel/locking/lockdep.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 92b1bfc5da60..7df9aa6902c0 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -265,7 +265,7 @@ extern void lockdep_info(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); -extern void lockdep_sys_exit(void); +extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_off(void); extern void lockdep_on(void); diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index eb8a54783fa0..c8b6753c5bb1 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4191,7 +4191,7 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); -void lockdep_sys_exit(void) +asmlinkage void lockdep_sys_exit(void) { struct task_struct *curr = current; -- cgit v1.3-14-g43fede From b35f8305339f1ba3070fe606c6ef0d86ef093dee Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:02 +0100 Subject: asmlinkage: Make trace_hardirq visible Can be called from assembler code. Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-6-git-send-email-ak@linux.intel.com Signed-off-by: H. 
Peter Anvin --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c8b6753c5bb1..aa3bf153b718 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2557,7 +2557,7 @@ static void __trace_hardirqs_on_caller(unsigned long ip) debug_atomic_inc(hardirqs_on_events); } -void trace_hardirqs_on_caller(unsigned long ip) +__visible void trace_hardirqs_on_caller(unsigned long ip) { time_hardirqs_on(CALLER_ADDR0, ip); @@ -2610,7 +2610,7 @@ EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off_caller(unsigned long ip) +__visible void trace_hardirqs_off_caller(unsigned long ip) { struct task_struct *curr = current; -- cgit v1.3-14-g43fede From 22d9fd3411c693ccae5f5c2280fb1f9bb106ad4f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:03 +0100 Subject: asmlinkage, mutex: Mark __visible Various kernel/mutex.c functions can be called from inline assembler, so they should be all global and __visible. Cc: Ingo Molnar Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-7-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/locking/mutex.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4dd6e4c219de..adbc0d0f314b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -67,8 +67,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -static __used noinline void __sched -__mutex_lock_slowpath(atomic_t *lock_count); +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); /** * mutex_lock - acquire the mutex @@ -225,7 +224,8 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) } #endif -static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +__visible __used noinline +void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** * mutex_unlock - release the mutex @@ -746,7 +746,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) /* * Release the lock, slowpath: */ -static __used noinline void +__visible void __mutex_unlock_slowpath(atomic_t *lock_count) { __mutex_unlock_common_slowpath(lock_count, 1); @@ -803,7 +803,7 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); -static __used noinline void __sched +__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); -- cgit v1.3-14-g43fede From 00b7103078596a243c16239004e0dc9416910f13 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:04 +0100 Subject: asmlinkage: Make main_extable_sort_needed visible main_extable_sort_needed is used by the build system and needs to be a normal ELF symbol. Make it visible so that LTO does not remove or mangle it. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-8-git-send-email-ak@linux.intel.com Signed-off-by: H. 
Peter Anvin --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 763faf037ec1..d8a6446adbcb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -36,7 +36,7 @@ extern struct exception_table_entry __start___ex_table[]; extern struct exception_table_entry __stop___ex_table[]; /* Cleared by build time tools if the table is already sorted. */ -u32 __initdata main_extable_sort_needed = 1; +u32 __initdata __visible main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) -- cgit v1.3-14-g43fede From 3ebae4f3a2e746ae17f25c741e249294e7d6d7c2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:05 +0100 Subject: asmlinkage: Mark rwsem functions that can be called from assembler asmlinkage Mark the rwsem functions that can be called from assembler asmlinkage. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-9-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/locking/rwsem-xadd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 19c5fa95e0b4..1d66e08e897d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -143,6 +143,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) /* * wait for the read lock to be granted */ +__visible struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; @@ -190,6 +191,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait until we successfully acquire the write lock */ +__visible struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) { long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS; @@ -252,6 +254,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) * handle waking up a waiter on the semaphore * - up_read/up_write has decremented the active part of count if we come here */ +__visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; @@ -272,6 +275,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) * - caller incremented waiting part of count and discovered it still negative * - just wake up any readers at the front of the queue */ +__visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; -- cgit v1.3-14-g43fede From a7330c997d0f74d909a7d3553b1d550d8be2b61a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:06 +0100 Subject: asmlinkage: Make __stack_chk_fail and memcmp visible In LTO, symbols implicitly referenced by the compiler need to be visible. Earlier these symbols were visible implicitly from being exported, but we disabled implicit visibility for EXPORTs when modules are disabled to improve code size. So now these symbols have to be marked visible explicitly. Do this for __stack_chk_fail (with stack protector) and memcmp. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-10-git-send-email-ak@linux.intel.com Signed-off-by: H.
Peter Anvin --- kernel/panic.c | 2 +- lib/string.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6d6300375090..3eb0ffb25960 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -459,7 +459,7 @@ EXPORT_SYMBOL(warn_slowpath_null); * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ -void __stack_chk_fail(void) +__visible void __stack_chk_fail(void) { panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); diff --git a/lib/string.c b/lib/string.c index e5878de4f101..9b1f9062a202 100644 --- a/lib/string.c +++ b/lib/string.c @@ -648,7 +648,7 @@ EXPORT_SYMBOL(memmove); * @count: The size of the area. */ #undef memcmp -int memcmp(const void *cs, const void *ct, size_t count) +__visible int memcmp(const void *cs, const void *ct, size_t count) { const unsigned char *su1, *su2; int res = 0; -- cgit v1.3-14-g43fede From 285c00adf651c9b1d6c73d5eee482d2a617a64c1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 08:52:08 +0100 Subject: asmlinkage: Make trace_hardirqs_on/off_caller visible These functions are called from assembler, and thus need to be __visible. Cc: Steven Rostedt Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391845930-28580-12-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/trace/trace_irqsoff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2aefbee93a6d..887ef88b0bc7 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -498,14 +498,14 @@ void trace_hardirqs_off(void) } EXPORT_SYMBOL(trace_hardirqs_off); -void trace_hardirqs_on_caller(unsigned long caller_addr) +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); -void trace_hardirqs_off_caller(unsigned long caller_addr) +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); -- cgit v1.3-14-g43fede From 80375980f1608f43b47abc2671456b23ec68c434 Mon Sep 17 00:00:00 2001 From: Joe Mario Date: Sat, 8 Feb 2014 09:01:09 +0100 Subject: lto: Handle LTO common symbols in module loader Here is the workaround I made for having the kernel not reject modules built with -flto. The clean solution would be to get the compiler to not emit the symbol. Or if it has to emit the symbol, then emit it as initialized data but put it into a comdat/linkonce section. Minor tweaks by AK over Joe's patch. Cc: Rusty Russell Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391846481-31491-5-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/module.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d24fcf29cb64..b99e80119eef 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1948,6 +1948,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) switch (sym[i].st_shndx) { case SHN_COMMON: + /* Ignore common symbols */ + if (!strncmp(name, "__gnu_lto", 9)) + break; + /* We compiled with -fno-common. These are not supposed to happen. 
*/ pr_debug("Common symbol: %s\n", name); -- cgit v1.3-14-g43fede From 58edae3aac9f2ccd1afb12ea08127e840a0a706c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Sat, 8 Feb 2014 09:01:10 +0100 Subject: lto: Disable LTO for sys_ni The assembler alias code in cond_syscall does not work when compiled for LTO. Just disable LTO for that file. Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1391846481-31491-6-git-send-email-ak@linux.intel.com Signed-off-by: H. Peter Anvin --- kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b6..31c26c61aaec 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -18,6 +18,9 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif +# cond_syscall is currently not LTO compatible +CFLAGS_sys_ni.o = $(DISABLE_LTO) + obj-y += sched/ obj-y += locking/ obj-y += power/ -- cgit v1.3-14-g43fede From bad34660344f37db8b55ce8bc139bddc7d83af1b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Feb 2014 16:54:28 +0800 Subject: cgroup: fix locking in cgroupstats_build() css_set_lock has been converted to css_set_rwsem, and rwsem can't nest inside rcu_read_lock. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15dcae74b510..5606c0f08d95 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2995,6 +2995,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) kernfs_type(kn) != KERNFS_DIR) return -EINVAL; + mutex_lock(&cgroup_mutex); + /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_from_dir(), @@ -3002,10 +3004,12 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) */ rcu_read_lock(); cgrp = rcu_dereference(kn->priv); - if (!cgrp) { + if (!cgrp || cgroup_is_dead(cgrp)) { rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); return -ENOENT; } + rcu_read_unlock(); css_task_iter_start(&cgrp->dummy_css, &it); while ((tsk = css_task_iter_next(&it))) { @@ -3030,7 +3034,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); - rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.3-14-g43fede From 6534fd6c15858fe4ce4ae568106225e68d5afa81 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Feb 2014 16:55:04 +0800 Subject: cgroup: fix memory leak in cgroup_mount() We should free the memory allocated in parse_cgroupfs_options() before calling this function again. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5606c0f08d95..3fe01102607b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1540,6 +1540,8 @@ retry: if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); + kfree(opts.release_agent); + kfree(opts.name); msleep(10); goto retry; } -- cgit v1.3-14-g43fede From 8ba14654282ed6bb386d0a2f1ab329bfb293403f Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Feb 2014 17:09:54 +0100 Subject: timer: Spare IPI when deferrable timer is queued on idle remote targets When a timer is enqueued or modified on a remote target, the latter is expected to see and handle this timer on its next tick. 
However if the target is idle and CONFIG_NO_HZ_IDLE=y, the CPU may be sleeping tickless and the timer may be ignored. wake_up_nohz_cpu() takes care of that by setting TIF_NEED_RESCHED and sending an IPI to idle targets so that the tick is reevaluated on the idle loop through the tick_nohz_idle_*() APIs. Now this is all performed regardless of the power properties of the timer. If the timer is deferrable, idle targets don't need to be woken up. Only the next busy tick needs to care about it, and no IPI kick is needed for that to happen. So let's spare the IPI on idle targets when the timer is deferrable. Meanwhile we keep the current behaviour on full dynticks targets. We can spare IPIs on idle full dynticks targets as well but some tricky races against idle_cpu() must be dealt with all along to make sure that the timer is well handled after idle exit. We can deal with that later since NO_HZ_FULL already has more important powersaving issues. Reported-by: Thomas Gleixner Signed-off-by: Viresh Kumar Cc: Ingo Molnar Cc: Paul Gortmaker Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/CAKohpomMZ0TAN2e6N76_g4ZRzxd5vZ1XfuZfxrP7GMxfTNiLVw@mail.gmail.com Signed-off-by: Frederic Weisbecker --- kernel/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e5..b75e7893be14 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -939,8 +939,15 @@ void add_timer_on(struct timer_list *timer, int cpu) * with the timer by holding the timer base lock. This also * makes sure that a CPU on the way to stop its tick can not * evaluate the timer wheel. + * + * Spare the IPI for deferrable timers on idle targets though. + * The next busy ticks will take care of it. Except full dynticks + * require special care against races with idle_cpu(), lets deal + * with that later. */ - wake_up_nohz_cpu(cpu); + if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(cpu)) + wake_up_nohz_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); -- cgit v1.3-14-g43fede From f96a34e27df19335155394a235ea3a096bc52a71 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 6 Feb 2014 13:36:21 -0500 Subject: nohz: ensure users are aware boot CPU is not NO_HZ_FULL This bit of information is in the Kconfig help text: "Note the boot CPU will still be kept outside the range to handle the timekeeping duty." However neither the variable NO_HZ_FULL_ALL nor the prompt conveys this important detail, so let's add it to the prompt to make it more explicitly obvious to the average user. Acked-by: Paul E. McKenney Signed-off-by: Paul Gortmaker Cc: Ingo Molnar Cc: Paul Gortmaker Cc: Paul E.
McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1391711781-7466-1-git-send-email-paul.gortmaker@windriver.com Signed-off-by: Frederic Weisbecker --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 3ce6e8c5f3fc..f448513a45ed 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -124,7 +124,7 @@ config NO_HZ_FULL endchoice config NO_HZ_FULL_ALL - bool "Full dynticks system on all CPUs by default" + bool "Full dynticks system on all CPUs by default (except CPU 0)" depends on NO_HZ_FULL help If the user doesn't pass the nohz_full boot option to -- cgit v1.3-14-g43fede From e4178d809fdaee32a56833fff1f5056c99e90a1a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Feb 2014 12:24:45 -0800 Subject: printk: fix syslog() overflowing user buffer This is not a buffer overflow in the traditional sense: we don't overflow any *kernel* buffers, but we do mis-count the amount of data we copy back to user space for the SYSLOG_ACTION_READ_ALL case. In particular, if the user buffer is too small to hold everything, and *if* there is a continuation line at just the right place, we can end up giving the user more data than he asked for. The reason is that we first count up the number of bytes all the log records contains, then we walk the records again until we've skipped the records at the beginning that won't fit, and then we walk the rest of the records and copy them to the user space buffer. And in between that "skip the initial records that won't fit" and the "copy the records that *will* fit to user space", we reset the 'prev' variable that contained the record information for the last record not copied. That meant that when we started copying to user space, we now had a different character count than what we had originally calculated in the first record walk-through. The fix is to simply not clear the 'prev' flags value (in both cases where we had the same logic: syslog_print_all and kmsg_dump_get_buffer: the latter is used for pstore-like dumping) Reported-and-tested-by: Debabrata Banerjee Acked-by: Kay Sievers Cc: Greg Kroah-Hartman Cc: Jeff Mahoney Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b1d255f04135..4dae9cbe9259 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1076,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) next_seq = log_next_seq; len = 0; - prev = 0; while (len >= 0 && seq < next_seq) { struct printk_log *msg = log_from_idx(idx); int textlen; @@ -2788,7 +2787,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, next_idx = idx; l = 0; - prev = 0; while (seq < dumper->next_seq) { struct printk_log *msg = log_from_idx(idx); -- cgit v1.3-14-g43fede From 3660c2813fb6d0ba48ee44bcbf9feddf7218c11d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Dec 2013 09:24:02 -0800 Subject: rcu: Add ACCESS_ONCE() to ->n_force_qs_lh accesses The ->n_force_qs_lh field is accessed without the benefit of any synchronization, so this commit adds the needed ACCESS_ONCE() wrappers. Yes, increments to ->n_force_qs_lh can be lost, but contention should be low and the field is strictly statistical in nature, so this is not a problem. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_trace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3d116cd072d..e64157798624 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2304,7 +2304,7 @@ static void force_quiescent_state(struct rcu_state *rsp) if (rnp_old != NULL) raw_spin_unlock(&rnp_old->fqslock); if (ret) { - rsp->n_force_qs_lh++; + ACCESS_ONCE(rsp->n_force_qs_lh)++; return; } rnp_old = rnp; @@ -2316,7 +2316,7 @@ static void force_quiescent_state(struct rcu_state *rsp) smp_mb__after_unlock_lock(); raw_spin_unlock(&rnp_old->fqslock); if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - rsp->n_force_qs_lh++; + ACCESS_ONCE(rsp->n_force_qs_lh)++; raw_spin_unlock_irqrestore(&rnp_old->lock, flags); return; /* Someone beat us to it. */ } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 4def475336d4..d1f1e64a6d72 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); + ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); -- cgit v1.3-14-g43fede From 87de1cfdc55b16b794e245b07322340725149d62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 3 Dec 2013 10:02:52 -0800 Subject: rcu: Stop tracking FSF's postal address All of the RCU source files have the usual GPL header, which contains a long-obsolete postal address for FSF. To avoid the need to track the FSF office's movements, this commit substitutes the URL where GPL may be found. Reported-by: Greg KH Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 4 ++-- include/linux/rcutiny.h | 4 ++-- include/linux/rcutree.h | 4 ++-- include/linux/srcu.h | 4 ++-- kernel/rcu/rcu.h | 4 ++-- kernel/rcu/srcu.c | 4 ++-- kernel/rcu/tiny.c | 4 ++-- kernel/rcu/tiny_plugin.h | 4 ++-- kernel/rcu/torture.c | 4 ++-- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree.h | 4 ++-- kernel/rcu/tree_plugin.h | 4 ++-- kernel/rcu/tree_trace.c | 4 ++-- kernel/rcu/update.c | 4 ++-- 14 files changed, 28 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 72bf3a01a4ee..b200756ea4c0 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2001 * diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6f01771b571c..c364e9148de2 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 72137ee8c603..08b084068967 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/include/linux/srcu.h b/include/linux/srcu.h index 9b058eecd403..a2783cb5d275 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 79c3877e9c5b..1bd787fddcb2 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2011 * diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 3318d8284384..5db7e9272d37 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2006 * Copyright (C) Fujitsu, 2012 diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 1254f312d024..53b95bbf4abb 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index 280d06cae352..431528520562 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (c) 2010 Linaro * diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 732f8ae3086a..ab7dd192a50b 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright (C) IBM Corporation, 2005, 2006 * diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e64157798624..321feef0f5c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 8c19873f1ac9..75dc3c39a02a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -13,8 +13,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6e2ef4b2b920..f9b9cdd36c8d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -14,8 +14,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright Red Hat, 2009 * Copyright IBM Corporation, 2009 diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index d1f1e64a6d72..5cdc62e1beeb 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 * diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c54609faf233..fd0d5b5b8e7c 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -12,8 +12,8 @@ * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2001 * -- cgit v1.3-14-g43fede From cb1e78cfa267453bb19e7edafd214c03834b664c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Dec 2013 18:42:03 -0800 Subject: rcu: Remove ACCESS_ONCE() from jiffies Because jiffies is one of a very few variables marked "volatile", there is no need to use ACCESS_ONCE() when accessing it. This commit therefore removes the redundant ACCESS_ONCE() wrappers. Reported by: Eric Dumazet Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/torture.c | 4 ++-- kernel/rcu/tree.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index ab7dd192a50b..022c5312b725 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -1352,7 +1352,7 @@ rcu_torture_shutdown(void *arg) unsigned long jiffies_snap; VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); + jiffies_snap = jiffies; while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && !kthread_should_stop()) { delta = shutdown_time - jiffies_snap; @@ -1361,7 +1361,7 @@ rcu_torture_shutdown(void *arg) "rcu_torture_shutdown task: %lu jiffies remaining\n", torture_type, delta); schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); + jiffies_snap = jiffies; } if (kthread_should_stop()) { VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 321feef0f5c0..73c3cd2b87ac 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -837,7 +837,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * to the next. Only do this for the primary flavor of RCU. */ if (rdp->rsp == rcu_state && - ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) { + ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { rdp->rsp->jiffies_resched += 5; resched_cpu(rdp->cpu); } @@ -847,7 +847,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, static void record_gp_stall_check_time(struct rcu_state *rsp) { - unsigned long j = ACCESS_ONCE(jiffies); + unsigned long j = jiffies; unsigned long j1; rsp->gp_start = j; @@ -1005,7 +1005,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) return; - j = ACCESS_ONCE(jiffies); + j = jiffies; /* * Lots of memory barriers to reject false positives. -- cgit v1.3-14-g43fede From 52e2bb958ac4f9b3c4bdd78606d279852fd72922 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sun, 9 Feb 2014 14:35:11 +0100 Subject: rcu: Disambiguate CONFIG_RCU_NOCB_CPUs This commit fixes a grammar issue in the rcu_nohz_full_cpu() comment header, so that it is clear that the plural is CPUs not Kconfig options. Signed-off-by: Paul Bolle Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index f9b9cdd36c8d..fffe4178a23d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2893,7 +2893,7 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) * CPU unless the grace period has extended for too long. * * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPUs. + * CONFIG_RCU_NOCB_CPU CPUs. */ static bool rcu_nohz_full_cpu(struct rcu_state *rsp) { -- cgit v1.3-14-g43fede From ae1670339c95c3ff96ab10582506cf827c5fecc8 Mon Sep 17 00:00:00 2001 From: Shaibal Dutta Date: Fri, 31 Jan 2014 11:53:06 -0800 Subject: rcu: Move SRCU grace period work to power efficient workqueue For better use of CPU idle time, allow the scheduler to select the CPU on which the SRCU grace period work would be scheduled. This improves idle residency time and conserves power. This functionality is enabled when CONFIG_WQ_POWER_EFFICIENT is selected. Cc: Lai Jiangshan Cc: "Paul E. McKenney" Cc: Dipankar Sarma Signed-off-by: Shaibal Dutta [zoran.markovic@linaro.org: Rebased to latest kernel version. Added commit message. Fixed code alignment.] Signed-off-by: Zoran Markovic Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/srcu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 5db7e9272d37..2359779e1daa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -398,7 +398,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; - schedule_delayed_work(&sp->work, 0); + queue_delayed_work(system_power_efficient_wq, &sp->work, 0); } spin_unlock_irqrestore(&sp->queue_lock, flags); } @@ -674,7 +674,8 @@ static void srcu_reschedule(struct srcu_struct *sp) } if (pending) - schedule_delayed_work(&sp->work, SRCU_INTERVAL); + queue_delayed_work(system_power_efficient_wq, + &sp->work, SRCU_INTERVAL); } /* -- cgit v1.3-14-g43fede From 2f33b512a5460578f6cf11d7b7867bed53157c7c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Nov 2013 18:25:48 -0800 Subject: rcu: Optimize rcu_is_nocb_cpu() for RCU_NOCB_CPU_ALL If CONFIG_RCU_NOCB_CPU_ALL=y, then rcu_is_nocb_cpu() will always return true, however, the current version nevertheless checks rcu_nocb_mask. This commit therefore creates a static inline implementation of rcu_is_nocb_cpu() that unconditionally returns true when CONFIG_RCU_NOCB_CPU_ALL=y. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 6 ++++-- kernel/rcu/tree_plugin.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 72bf3a01a4ee..281c90f8989e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1015,11 +1015,13 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) -#ifdef CONFIG_RCU_NOCB_CPU +#if defined(CONFIG_RCU_NOCB_CPU_ALL) +static inline bool rcu_is_nocb_cpu(int cpu) { return true; } +#elif defined(CONFIG_RCU_NOCB_CPU) bool rcu_is_nocb_cpu(int cpu); #else static inline bool rcu_is_nocb_cpu(int cpu) { return false; } -#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +#endif /* Only for use by adaptive-ticks code. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 6e2ef4b2b920..39a50b918bff 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2101,6 +2101,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) init_waitqueue_head(&rnp->nocb_gp_wq[1]); } +#ifndef CONFIG_RCU_NOCB_CPU_ALL /* Is the specified CPU a no-CPUs CPU? */ bool rcu_is_nocb_cpu(int cpu) { @@ -2108,6 +2109,7 @@ bool rcu_is_nocb_cpu(int cpu) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Enqueue the specified string of rcu_head structures onto the specified -- cgit v1.3-14-g43fede From ffa83fb565fbc397cbafb4b71fd1cce276d4c3b6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Nov 2013 19:27:16 -0800 Subject: rcu: Optimize rcu_needs_cpu() for RCU_NOCB_CPU_ALL If CONFIG_RCU_NOCB_CPU_ALL=y, then rcu_needs_cpu() will always return false, however, the current version nevertheless checks for RCU callbacks. This commit therefore creates a static inline implementation of rcu_needs_cpu() that unconditionally returns false when CONFIG_RCU_NOCB_CPU_ALL=y. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 8 ++++++++ include/linux/rcutiny.h | 6 ------ include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 4 ++++ 5 files changed, 15 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 281c90f8989e..5f7d5f410d50 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1015,6 +1015,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu(ptr, rcu_head) \ __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) +#if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + *delta_jiffies = ULONG_MAX; + return 0; +} +#endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */ + #if defined(CONFIG_RCU_NOCB_CPU_ALL) static inline bool rcu_is_nocb_cpu(int cpu) { return true; } #elif defined(CONFIG_RCU_NOCB_CPU) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 6f01771b571c..9524903487d0 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -68,12 +68,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } -static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) -{ - *delta_jiffies = ULONG_MAX; - return 0; -} - static inline void rcu_note_context_switch(int cpu) { rcu_sched_qs(cpu); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 72137ee8c603..81198c84e268 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -31,7 +31,9 @@ #define __LINUX_RCUTREE_H void rcu_note_context_switch(int cpu); +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies); +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ void rcu_cpu_stall_reset(void); /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3d116cd072d..c2c8234a0291 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2880,7 +2880,7 @@ static int rcu_pending(int cpu) * non-NULL, store an indication of whether all callbacks are lazy. * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) +static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { bool al = true; bool hc = false; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 39a50b918bff..820b06aefbee 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu, NULL); } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up @@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void) * * The caller must have disabled interrupts. */ +#ifndef CONFIG_RCU_NOCB_CPU_ALL int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); @@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) } return 0; } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ /* * Prepare a CPU for idle from an RCU perspective. 
The first major task -- cgit v1.3-14-g43fede From f1f399d1281ea339a08469f7e58193624992f620 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 17 Nov 2013 21:08:07 -0800 Subject: rcu: Optimize RCU_FAST_NO_HZ for RCU_NOCB_CPU_ALL If CONFIG_RCU_NOCB_CPU_ALL=y, then no CPU will ever have RCU callbacks because these callbacks will instead be handled by the rcuo kthreads. However, the current version of RCU_FAST_NO_HZ nevertheless checks for RCU callbacks. This commit therefore creates static inline implementations of rcu_prepare_for_idle() and rcu_cleanup_after_idle() that are no-ops when CONFIG_RCU_NOCB_CPU_ALL=y. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/tree_plugin.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 820b06aefbee..41afc3fbfb6c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1658,7 +1658,7 @@ extern int tick_nohz_active; * only if it has been awhile since the last time we did so. Afterwards, * if there are any callbacks ready for immediate invocation, return true. */ -static bool rcu_try_advance_all_cbs(void) +static bool __maybe_unused rcu_try_advance_all_cbs(void) { bool cbs_ready = false; struct rcu_data *rdp; @@ -1743,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj) */ static void rcu_prepare_for_idle(int cpu) { +#ifndef CONFIG_RCU_NOCB_CPU_ALL struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); struct rcu_node *rnp; @@ -1794,6 +1795,7 @@ static void rcu_prepare_for_idle(int cpu) rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* @@ -1803,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_cleanup_after_idle(int cpu) { - +#ifndef CONFIG_RCU_NOCB_CPU_ALL if (rcu_is_nocb_cpu(cpu)) return; if (rcu_try_advance_all_cbs()) invoke_rcu_core(); +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */ } /* -- cgit v1.3-14-g43fede From 45a22f4c11fef4ecd5c61c0a299cd3f23d77be8e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Feb 2014 13:09:50 +0100 Subject: inotify: Fix reporting of cookies for inotify events My rework of handling of notification events (namely commit 7053aee26a35 "fsnotify: do not share events between notification groups") broke sending of cookies with inotify events. We didn't propagate the value passed to fsnotify() properly and passed 4 uninitialized bytes to userspace instead (so it is also an information leak). Sadly I didn't notice this during my testing because inotify cookies aren't used very much and LTP inotify tests ignore them. Fix the problem by passing the cookie value properly. 
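To make the cookie's role concrete, here is a small userspace sketch (illustrative only, error handling omitted) that pairs the two halves of a rename; with the bug described above, the cookie field this program prints contained uninitialized kernel bytes:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/inotify.h>

    int main(void)
    {
            char buf[4096];
            struct inotify_event *ev;
            ssize_t len;
            char *p;
            int fd = inotify_init();

            inotify_add_watch(fd, "/tmp", IN_MOVED_FROM | IN_MOVED_TO);
            len = read(fd, buf, sizeof(buf));
            for (p = buf; p < buf + len; p += sizeof(*ev) + ev->len) {
                    ev = (struct inotify_event *)p;
                    /* IN_MOVED_FROM/IN_MOVED_TO events carrying the same
                     * cookie are the two halves of a single rename(). */
                    printf("mask=%x cookie=%u name=%s\n", ev->mask,
                           ev->cookie, ev->len ? ev->name : "");
            }
            return 0;
    }
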
Fixes: 7053aee26a3548ebaba046ae2e52396ccf56ac6c Reported-by: Vegard Nossum Signed-off-by: Jan Kara --- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fanotify/fanotify.c | 2 +- fs/notify/fsnotify.c | 2 +- fs/notify/inotify/inotify.h | 2 +- fs/notify/inotify/inotify_fsnotify.c | 3 ++- fs/notify/inotify/inotify_user.c | 2 +- include/linux/fsnotify_backend.h | 2 +- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 0b9ff4395e6a..abc8cbcfe90e 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -86,7 +86,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 0e792f5e3147..205dc2163822 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -147,7 +147,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { int ret = 0; struct fanotify_event_info *event; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 1d4e1ea2f37c..9d3e9c50066a 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -179,7 +179,7 @@ static int send_to_group(struct inode *to_tell, return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, - file_name); + file_name, cookie); } /* diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 485eef3f4407..ed855ef6f077 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -27,6 +27,6 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name); + const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d5ee56348bb8..43ab1e1a07a2 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -67,7 +67,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; @@ -103,6 +103,7 @@ int inotify_handle_event(struct fsnotify_group *group, fsn_event = &event->fse; fsnotify_init_event(fsn_event, inode, mask); event->wd = i_mark->wd; + event->sync_cookie = cookie; event->name_len = len; if (len) strcpy(event->name, file_name); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 497395c8274b..6528b5a54ca0 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -495,7 +495,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, fsn_mark, NULL, 
FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL); + NULL, FSNOTIFY_EVENT_NONE, NULL, 0); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 3d286ff49ab0..c84bc7c2bfc8 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -99,7 +99,7 @@ struct fsnotify_ops { struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name); + const unsigned char *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 67ccf0e7cca9..135944a7b28a 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *file_name) + const unsigned char *file_name, u32 cookie) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 2596fac5dcb4..70b4554d2fbe 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, void *data, int data_type, - const unsigned char *dname) + const unsigned char *dname, u32 cookie) { struct inode *inode; struct audit_parent *parent; -- cgit v1.3-14-g43fede From 5bdfff96c69a4d5ab9c49e60abf9e070ecd2acbb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Sat, 15 Feb 2014 22:02:28 +0800 Subject: workqueue: ensure @task is valid across kthread_stop() When a kworker should die, the kworker is notified through the WORKER_DIE flag instead of kthread_should_stop(). This, IIRC, is primarily to keep the test synchronized inside worker_pool lock. WORKER_DIE is first set while holding pool->lock, the lock is dropped and kthread_stop() is called. Unfortunately, this means that there's a slight chance that the target kworker may see WORKER_DIE before kthread_stop() finishes and exits and frees the target task before or during kthread_stop(). Fix it by pinning the target task before setting WORKER_DIE and putting it after kthread_stop() is done. tj: Improved patch description and comment. Moved pinning above WORKER_DIE to better signify what it's protecting. CC: stable@vger.kernel.org Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82ef9f3b7473..193e977a10ea 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1851,6 +1851,12 @@ static void destroy_worker(struct worker *worker) if (worker->flags & WORKER_IDLE) pool->nr_idle--; + /* + * Once WORKER_DIE is set, the kworker may destroy itself at any + * point. Pin to ensure the task stays until we're done with it.
+ */ + get_task_struct(worker->task); + list_del_init(&worker->entry); worker->flags |= WORKER_DIE; @@ -1859,6 +1865,7 @@ spin_unlock_irq(&pool->lock); kthread_stop(worker->task); + put_task_struct(worker->task); kfree(worker); spin_lock_irq(&pool->lock); -- cgit v1.3-14-g43fede From dc5736ed7aaf942caaac0c15af74a018e04ec79d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 17 Feb 2014 10:41:50 +0800 Subject: cgroup: add a validation check to cgroup_add_cftypes() Fengguang reported this bug: BUG: unable to handle kernel NULL pointer dereference at 0000003c IP: [] cgroup_cfts_commit+0x27/0x1c1 ... Call Trace: [] ? kmem_cache_alloc_trace+0x33f/0x3b7 [] cgroup_add_cftypes+0x8f/0xca [] cgroup_init+0x6a/0x26a [] start_kernel+0x4d7/0x57a [] i386_start_kernel+0x92/0x96 This happens in a corner case. If CGROUP_SCHED=y but CFS_BANDWIDTH=n && FAIR_GROUP_SCHED=n && RT_GROUP_SCHED=n, we have: cpu_files[] = { { } /* terminate */ } When we pass cpu_files to cgroup_apply_cftypes(), as cpu_files[0].ss is NULL, we'll dereference a NULL pointer. The bug was introduced by commit de00ffa56ea3132c6013fc8f07133b8a1014cf53 ("cgroup: make cgroup_subsys->base_cftypes use cgroup_add_cftypes()"). Reported-by: Fengguang Wu Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3fe01102607b..771d1b8aaae9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2348,6 +2348,9 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; + if (!cfts || cfts[0].name[0] == '\0') + return 0; + ret = cgroup_init_cftypes(ss, cfts); if (ret) return ret; -- cgit v1.3-14-g43fede From 532de3fc72adc2a6525c4d53c07bf81e1732083d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 13:29:31 -0500 Subject: cgroup: update cgroup_enable_task_cg_lists() to grab siglock Currently, there's nothing preventing cgroup_enable_task_cg_lists() from missing PF_EXITING being set and racing against cgroup_exit(). Depending on the timing, cgroup_exit() may finish with the task still linked on a css_set, leading to list corruption. Fix it by grabbing siglock in cgroup_enable_task_cg_lists() so that PF_EXITING is guaranteed to be visible. This whole on-demand cg_list optimization is extremely fragile and has ample possibility to lead to bugs which can cause things like a once-a-year oops during boot. I'm wondering whether the better approach would be just adding "cgroup_disable=all" handling which disables the whole cgroup rather than tempting fate with this on-demand craziness. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 68d87103b493..105f273b6f86 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2905,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void) * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. + * Do it while holding siglock so that we don't end up + * racing against cgroup_exit().
*/ + spin_lock_irq(&p->sighand->siglock); if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &task_css_set(p)->tasks); + spin_unlock_irq(&p->sighand->siglock); + task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); -- cgit v1.3-14-g43fede From e227867f12302633737bd2a48a10a9a72c0630cb Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Tue, 18 Feb 2014 22:54:36 +0900 Subject: treewide: Fix typo in Documentation/DocBook This patch fixes spelling typos in Documentation/DocBook. Because the .html and .xml files are generated by make htmldocs, the typos have to be fixed within the source files. Signed-off-by: Masanari Iida Acked-by: Randy Dunlap Signed-off-by: Jiri Kosina --- arch/s390/include/asm/cio.h | 2 +- block/blk-core.c | 2 +- block/blk-map.c | 2 +- drivers/ata/libata-core.c | 4 ++-- drivers/base/bus.c | 2 +- drivers/gpu/drm/drm_crtc_helper.c | 2 +- drivers/input/sparse-keymap.c | 2 +- drivers/regulator/core.c | 2 +- drivers/scsi/scsi_transport_iscsi.c | 6 +++--- drivers/usb/core/message.c | 2 +- drivers/usb/core/urb.c | 2 +- fs/buffer.c | 2 +- fs/debugfs/inode.c | 6 +++--- include/drm/drm_fb_helper.h | 2 +- include/linux/hsi/hsi.h | 2 +- include/linux/kfifo.h | 2 +- include/linux/pipe_fs_i.h | 2 +- include/linux/skbuff.h | 2 +- include/linux/spi/spi.h | 8 ++++---- include/linux/usb/composite.h | 2 +- include/net/mac80211.h | 6 +++--- kernel/relay.c | 2 +- kernel/signal.c | 2 +- net/core/dev.c | 2 +- 24 files changed, 34 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h index d42625053c37..096339207764 100644 --- a/arch/s390/include/asm/cio.h +++ b/arch/s390/include/asm/cio.h @@ -199,7 +199,7 @@ struct esw_eadm { /** * struct irb - interruption response block * @scsw: subchannel status word - * @esw: extened status word + * @esw: extended status word * @ecw: extended control word * * The irb that is handed to the device driver when an interrupt occurs. For diff --git a/block/blk-core.c b/block/blk-core.c index 8bdd0121212a..cd0158163fe0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1900,7 +1900,7 @@ EXPORT_SYMBOL(submit_bio); * in some cases below, so export this function. * Request stacking drivers like request-based dm may change the queue * limits while requests are in the queue (e.g. dm's table swapping). - * Such request stacking drivers should check those requests agaist + * Such request stacking drivers should check those requests against * the new queue limits again when they dispatch those requests, * although such checkings are also done against the old queue limits * when submitting requests. diff --git a/block/blk-map.c b/block/blk-map.c index 623e1cd4cffe..62382ad5b010 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -285,7 +285,7 @@ EXPORT_SYMBOL(blk_rq_unmap_user); * * Description: * Data will be mapped directly if possible. Otherwise a bounce - * buffer is used. Can be called multple times to append multple + * buffer is used. Can be called multiple times to append multiple * buffers.
*/ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 75b93678bbcd..1274720e6bb9 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1524,7 +1524,7 @@ static void ata_qc_complete_internal(struct ata_queued_cmd *qc) * @dev: Device to which the command is sent * @tf: Taskfile registers for the command and the result * @cdb: CDB for packet command - * @dma_dir: Data tranfer direction of the command + * @dma_dir: Data transfer direction of the command * @sgl: sg list for the data buffer of the command * @n_elem: Number of sg entries * @timeout: Timeout in msecs (0 for default) @@ -1712,7 +1712,7 @@ unsigned ata_exec_internal_sg(struct ata_device *dev, * @dev: Device to which the command is sent * @tf: Taskfile registers for the command and the result * @cdb: CDB for packet command - * @dma_dir: Data tranfer direction of the command + * @dma_dir: Data transfer direction of the command * @buf: Data buffer of the command * @buflen: Length of data buffer * @timeout: Timeout in msecs (0 for default) diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 73f6c2925281..1db22d3c4036 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -1209,7 +1209,7 @@ err_dev: * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration - * number appended. The registered devices are not explicitely named; + * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c index 01361aba033b..0058fd74063e 100644 --- a/drivers/gpu/drm/drm_crtc_helper.c +++ b/drivers/gpu/drm/drm_crtc_helper.c @@ -593,7 +593,7 @@ drm_crtc_helper_disable(struct drm_crtc *crtc) * Caller must hold mode config lock. * * Setup a new configuration, provided by the upper layers (either an ioctl call - * from userspace or internally e.g. from the fbdev suppport code) in @set, and + * from userspace or internally e.g. from the fbdev support code) in @set, and * enable it. This is the main helper functions for drivers that implement * kernel mode setting with the crtc helper functions and the assorted * ->prepare(), ->modeset() and ->commit() helper callbacks. diff --git a/drivers/input/sparse-keymap.c b/drivers/input/sparse-keymap.c index a70aa555bbff..e7409c45bdd0 100644 --- a/drivers/input/sparse-keymap.c +++ b/drivers/input/sparse-keymap.c @@ -236,7 +236,7 @@ EXPORT_SYMBOL(sparse_keymap_setup); * in an input device that was set up by sparse_keymap_setup(). * NOTE: It is safe to cal this function while input device is * still registered (however the drivers should care not to try to - * use freed keymap and thus have to shut off interrups/polling + * use freed keymap and thus have to shut off interrupts/polling * before freeing the keymap). */ void sparse_keymap_free(struct input_dev *dev) diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index d85f31385b24..d59aa96a4dc4 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -2134,7 +2134,7 @@ EXPORT_SYMBOL_GPL(regulator_is_enabled); * @regulator: regulator source * * Returns positive if the regulator driver backing the source/client - * can change its voltage, false otherwise. 
Usefull for detecting fixed + * can change its voltage, false otherwise. Useful for detecting fixed * or dummy regulators and disabling voltage change logic in the client * driver. */ diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 63a6ca49d4e5..de5b4d9bb022 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -891,7 +891,7 @@ struct bus_type iscsi_flashnode_bus = { * Adds a sysfs entry for the flashnode session attributes * * Returns: - * pointer to allocated flashnode sess on sucess + * pointer to allocated flashnode sess on success * %NULL on failure */ struct iscsi_bus_flash_session * @@ -1089,7 +1089,7 @@ static int iscsi_iter_destroy_flashnode_conn_fn(struct device *dev, void *data) } /** - * iscsi_destroy_flashnode_sess - destory flashnode session entry + * iscsi_destroy_flashnode_sess - destroy flashnode session entry * @fnode_sess: pointer to flashnode session entry to be destroyed * * Deletes the flashnode session entry and all children flashnode connection @@ -1119,7 +1119,7 @@ static int iscsi_iter_destroy_flashnode_fn(struct device *dev, void *data) } /** - * iscsi_destroy_all_flashnode - destory all flashnode session entries + * iscsi_destroy_all_flashnode - destroy all flashnode session entries * @shost: pointer to host data * * Destroys all the flashnode session entries and all corresponding children diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index bb315970e475..874d1a406ebc 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -179,7 +179,7 @@ EXPORT_SYMBOL_GPL(usb_control_msg); * * Return: * If successful, 0. Otherwise a negative error number. The number of actual - * bytes transferred will be stored in the @actual_length paramater. + * bytes transferred will be stored in the @actual_length parameter. */ int usb_interrupt_msg(struct usb_device *usb_dev, unsigned int pipe, void *data, int len, int *actual_length, int timeout) diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index e62208356c89..e726f5e80448 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -834,7 +834,7 @@ EXPORT_SYMBOL_GPL(usb_unpoison_anchored_urbs); * * this allows all outstanding URBs to be unlinked starting * from the back of the queue. This function is asynchronous. - * The unlinking is just tiggered. It may happen after this + * The unlinking is just triggered. It may happen after this * function has returned. * * This routine should not be called by a driver after its disconnect diff --git a/fs/buffer.c b/fs/buffer.c index 6024877335ca..a20f2eb107ed 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3086,7 +3086,7 @@ EXPORT_SYMBOL(submit_bh); * until the buffer gets unlocked). * * ll_rw_block sets b_end_io to simple completion handler that marks - * the buffer up-to-date (if approriate), unlocks the buffer and wakes + * the buffer up-to-date (if appropriate), unlocks the buffer and wakes * any waiters. * * All of the buffers must be for the same device, and must also be a diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 9c0444cccbe1..ca4a08f38374 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -358,7 +358,7 @@ exit: * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. 
If this parameter is NULL, then the * file will be created in the root of the debugfs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on @@ -400,7 +400,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file); * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a - * directory dentry if set. If this paramater is NULL, then the + * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the debugfs filesystem. * * This function creates a directory in debugfs with the given name. @@ -425,7 +425,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir); * @name: a pointer to a string containing the name of the symbolic link to * create. * @parent: a pointer to the parent dentry for this symbolic link. This - * should be a directory dentry if set. If this paramater is NULL, + * should be a directory dentry if set. If this parameter is NULL, * then the symbolic link will be created in the root of the debugfs * filesystem. * @target: a pointer to a string containing the path to the target of the diff --git a/include/drm/drm_fb_helper.h b/include/drm/drm_fb_helper.h index 471f276ce8f7..0145b948b147 100644 --- a/include/drm/drm_fb_helper.h +++ b/include/drm/drm_fb_helper.h @@ -55,7 +55,7 @@ struct drm_fb_helper_surface_size { * save the current lut when force-restoring the fbdev for e.g. * kdbg. * @fb_probe: Driver callback to allocate and initialize the fbdev info - * structure. Futhermore it also needs to allocate the drm + * structure. Furthermore it also needs to allocate the drm * framebuffer used to back the fbdev. * @initial_config: Setup an initial fbdev display configuration * diff --git a/include/linux/hsi/hsi.h b/include/linux/hsi/hsi.h index 0dca785288cf..39bfd5b89077 100644 --- a/include/linux/hsi/hsi.h +++ b/include/linux/hsi/hsi.h @@ -178,7 +178,7 @@ static inline void hsi_unregister_client_driver(struct hsi_client_driver *drv) * @complete: Transfer completion callback * @destructor: Destructor to free resources when flushing * @status: Status of the transfer when completed - * @actual_len: Actual length of data transfered on completion + * @actual_len: Actual length of data transferred on completion * @channel: Channel were to TX/RX the message * @ttype: Transfer type (TX if set, RX otherwise) * @break_frame: if true HSI will send/receive a break frame. Data buffers are diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 552d51efb429..554fde3a3927 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -722,7 +722,7 @@ __kfifo_uint_must_check_helper( \ /** * kfifo_dma_out_finish - finish a DMA OUT operation * @fifo: address of the fifo to be used - * @len: number of bytes transferd + * @len: number of bytes transferrd * * This macro finish a DMA OUT operation. The out counter will be updated by * the len parameter. No error checking will be done. 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index b8809fef61f5..11982d0ce11b 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -35,7 +35,7 @@ struct pipe_buffer { * @tmp_page: cached released page * @readers: number of current readers of this pipe * @writers: number of current writers of this pipe - * @files: number of struct file refering this pipe (protected by ->i_lock) + * @files: number of struct file referring this pipe (protected by ->i_lock) * @waiting_writers: number of writers blocked waiting for room * @r_counter: reader counter * @w_counter: writer counter diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 215b5ea1cb30..cde842513df2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1951,7 +1951,7 @@ static inline void skb_propagate_pfmemalloc(struct page *page, } /** - * skb_frag_page - retrieve the page refered to by a paged fragment + * skb_frag_page - retrieve the page referred to by a paged fragment * @frag: the paged fragment * * Returns the &struct page associated with @frag. diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 8c62ba74dd91..8d3a37bc6110 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -234,7 +234,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @mode_bits: flags understood by this controller driver * @bits_per_word_mask: A mask indicating which values of bits_per_word are * supported by the driver. Bit n indicates that a bits_per_word n+1 is - * suported. If set, the SPI core will reject any transfer with an + * supported. If set, the SPI core will reject any transfer with an * unsupported bits_per_word. If not set, this value is simply ignored, * and it's up to the individual driver to perform any validation. * @min_speed_hz: Lowest supported transfer speed @@ -259,7 +259,7 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @cur_msg: the currently in-flight message * @cur_msg_prepared: spi_prepare_message was called for the currently * in-flight message - * @xfer_completion: used by core tranfer_one_message() + * @xfer_completion: used by core transfer_one_message() * @busy: message pump is busy * @running: message pump is running * @rt: whether this queue is set to run as a realtime task @@ -493,7 +493,7 @@ extern struct spi_master *spi_busnum_to_master(u16 busnum); * @rx_buf: data to be read (dma-safe memory), or NULL * @tx_dma: DMA address of tx_buf, if @spi_message.is_dma_mapped * @rx_dma: DMA address of rx_buf, if @spi_message.is_dma_mapped - * @tx_nbits: number of bits used for writting. If 0 the default + * @tx_nbits: number of bits used for writing. If 0 the default * (SPI_NBITS_SINGLE) is used. * @rx_nbits: number of bits used for reading. If 0 the default * (SPI_NBITS_SINGLE) is used. @@ -551,7 +551,7 @@ extern struct spi_master *spi_busnum_to_master(u16 busnum); * by the results of previous messages and where the whole transaction * ends when the chipselect goes intactive. * - * When SPI can transfer in 1x,2x or 4x. It can get this tranfer information + * When SPI can transfer in 1x,2x or 4x. It can get this transfer information * from device through @tx_nbits and @rx_nbits. In Bi-direction, these * two should both be set. User can set transfer mode with SPI_NBITS_SINGLE(1x) * SPI_NBITS_DUAL(2x) and SPI_NBITS_QUAD(4x) to support these three transfer. 
diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 5e61589fc166..0e7a555cab1e 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -92,7 +92,7 @@ struct usb_configuration; * @suspend: Notifies functions when the host stops sending USB traffic. * @resume: Notifies functions when the host restarts USB traffic. * @get_status: Returns function status as a reply to - * GetStatus() request when the recepient is Interface. + * GetStatus() request when the recipient is Interface. * @func_suspend: callback to be called when * SetFeature(FUNCTION_SUSPEND) is reseived * diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7ceed99a05bc..6b79bfc98175 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1841,7 +1841,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * * Driver informs U-APSD client support by enabling * %IEEE80211_HW_SUPPORTS_UAPSD flag. The mode is configured through the - * uapsd paramater in conf_tx() operation. Hardware needs to send the QoS + * uapsd parameter in conf_tx() operation. Hardware needs to send the QoS * Nullfunc frames and stay awake until the service period has ended. To * utilize U-APSD, dynamic powersave is disabled for voip AC and all frames * from that AC are transmitted with powersave enabled. @@ -2047,7 +2047,7 @@ void ieee80211_free_txskb(struct ieee80211_hw *hw, struct sk_buff *skb); * with the number of frames to be released and which TIDs they are * to come from. In this case, the driver is responsible for setting * the EOSP (for uAPSD) and MORE_DATA bits in the released frames, - * to help the @more_data paramter is passed to tell the driver if + * to help the @more_data parameter is passed to tell the driver if * there is more data on other TIDs -- the TIDs to release frames * from are ignored since mac80211 doesn't know how many frames the * buffers for those TIDs contain. @@ -2592,7 +2592,7 @@ enum ieee80211_roc_type { * parameters. In the case where the driver buffers some frames for * sleeping stations mac80211 will use this callback to tell the driver * to release some frames, either for PS-poll or uAPSD. - * Note that if the @more_data paramter is %false the driver must check + * Note that if the @more_data parameter is %false the driver must check * if there are more frames on the given TIDs, and if there are more than * the frames being released then it must still set the more-data bit in * the frame. If the @more_data parameter is %true, then of course the diff --git a/kernel/relay.c b/kernel/relay.c index 5001c9887db1..52d6a6f56261 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -227,7 +227,7 @@ static void relay_destroy_buf(struct rchan_buf *buf) * relay_remove_buf - remove a channel buffer * @kref: target kernel reference that contains the relay buffer * - * Removes the file from the fileystem, which also frees the + * Removes the file from the filesystem, which also frees the * rchan_buf_struct and the channel buffer. Should only be called from * kref_put(). */ diff --git a/kernel/signal.c b/kernel/signal.c index 940b30ee9a30..f4812283c6e9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2382,7 +2382,7 @@ relock: * @regs: user register state * @stepping: nonzero if debugger single-step or block-step in use * - * This function should be called when a signal has succesfully been + * This function should be called when a signal has successfully been * delivered. 
It updates the blocked signals accordingly (@ka->sa.sa_mask * is always blocked, and the signal itself is blocked unless %SA_NODEFER * is set in @ka->sa.sa_flags. Tracing is notified. diff --git a/net/core/dev.c b/net/core/dev.c index d2b87dbbbb1a..70d2da3bfb0d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3424,7 +3424,7 @@ out: * @rx_handler: receive handler to register * @rx_handler_data: data pointer that is used by rx handler * - * Register a receive hander for a device. This handler will then be + * Register a receive handler for a device. This handler will then be * called from __netif_receive_skb. A negative errno code is returned * on a failure. * -- cgit v1.3-14-g43fede From 392b21897d6cbff55c5b28910dfdc74e3020de6c Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Sun, 16 Feb 2014 22:58:12 -0500 Subject: user_namespace.c: Remove duplicated word in comment Signed-off-by: Brian Campbell Signed-off-by: Jiri Kosina --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62cf394..dd06439b9c84 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test - * for and handle handle INVALID_UID being returned. INVALID_UID + * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) -- cgit v1.3-14-g43fede From 5ae8aabeaec3fe69c4fb21cbe5b17b72b35b5892 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Mon, 17 Feb 2014 10:45:36 -0800 Subject: sched_clock: Prevent callers from seeing half-updated data The generic sched_clock registration function was previously done without locking, because it was expected to be called only once. However, there are now systems that may register multiple sched_clock sources, for which the lack of locking has caused problems: if two sched_clock sources are registered, we may end up in a situation where a call to sched_clock() may be accessing the epoch cycle count for the old counter and the cycle count for the new counter. This can lead to confusing results where sched_clock() values jump and then are reset to 0 (due to the way the registration function forces the epoch_ns to be 0). Fix this by reorganizing the registration function to hold the seqlock for as short a time as possible while we update the clock_data structure for a new counter. We also put any accumulated time into epoch_ns instead of resetting the time to 0 so that the clock doesn't reset after each successful registration.
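[ed: For context, the reader side pairs with the seqcount writes added below. This is a condensed sketch of the pattern, close to but not verbatim the sched_clock() of this era; cd, read_sched_clock() and cyc_to_ns() are the symbols this file already uses:

	unsigned long long notrace sched_clock(void)
	{
		u64 epoch_ns, epoch_cyc, cyc;
		unsigned long seq;

		do {
			/* retry if the registration update runs concurrently */
			seq = raw_read_seqcount_begin(&cd.seq);
			epoch_cyc = cd.epoch_cyc;
			epoch_ns = cd.epoch_ns;
		} while (read_seqcount_retry(&cd.seq, seq));

		cyc = read_sched_clock();
		cyc = (cyc - epoch_cyc) & sched_clock_mask;
		return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
	}

Because every field is read inside one seqcount critical section, moving all of the writes into a single raw_write_seqcount_begin()/raw_write_seqcount_end() pair guarantees readers always see a consistent snapshot.]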
[jstultz: Added extra context to the commit message] Reported-by: Will Deacon Signed-off-by: Stephen Boyd Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Will Deacon Cc: Peter Zijlstra Cc: Josh Cartwright Link: http://lkml.kernel.org/r/1392662736-7803-2-git-send-email-john.stultz@linaro.org Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner --- kernel/time/sched_clock.c | 46 +++++++++++++++++++++++++++++----------------- 1 file changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0abb36464281..4d23dc4d8139 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) void __init sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { + u64 res, wrap, new_mask, new_epoch, cyc, ns; + u32 new_mult, new_shift; + ktime_t new_wrap_kt; unsigned long r; - u64 res, wrap; char r_unit; if (cd.rate > rate) return; WARN_ON(!irqs_disabled()); - read_sched_clock = read; - sched_clock_mask = CLOCKSOURCE_MASK(bits); - cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. */ - clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600); + clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); + + new_mask = CLOCKSOURCE_MASK(bits); + + /* calculate how many ns until we wrap */ + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + + /* update epoch for new counter and update epoch_ns from old counter*/ + new_epoch = read(); + cyc = read_sched_clock(); + ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, + cd.mult, cd.shift); + + raw_write_seqcount_begin(&cd.seq); + read_sched_clock = read; + sched_clock_mask = new_mask; + cd.rate = rate; + cd.wrap_kt = new_wrap_kt; + cd.mult = new_mult; + cd.shift = new_shift; + cd.epoch_cyc = new_epoch; + cd.epoch_ns = ns; + raw_write_seqcount_end(&cd.seq); r = rate; if (r >= 4000000) { @@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits, } else r_unit = ' '; - /* calculate how many ns until we wrap */ - wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask); - cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); - /* calculate the ns resolution of this counter */ - res = cyc_to_ns(1ULL, cd.mult, cd.shift); + res = cyc_to_ns(1ULL, new_mult, new_shift); + pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - update_sched_clock(); - - /* - * Ensure that sched_clock() starts off at 0ns - */ - cd.epoch_ns = 0; - /* Enable IRQ time accounting if we have a fast enough sched_clock */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) enable_sched_clock_irqtime(); -- cgit v1.3-14-g43fede From 18258f7239a61d8929b8e0c7b6d46c446459074c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 15 Feb 2014 00:55:18 +0000 Subject: genirq: Provide synchronize_hardirq() synchronize_irq() waits for hard irq and threaded handlers to complete before returning. For some special cases we only need to make sure that the hard interrupt part of the irq line is not in progress when we disabled the - possibly shared - interrupt at the device level. A proper use case for this was provided by Russell. The sdhci driver requires some irq triggered functions to be run in thread context. 
The current implementation of the thread context is an sdio-private kthread construct, which has quite some shortcomings. These can be avoided when the thread is directly associated to the device interrupt via the generic threaded irq infrastructure. There is, however, a corner case related to runtime power management where one side disables the device interrupts at the device level and needs to make sure that an already running hard interrupt handler has completed before proceeding further. That hard interrupt handler might wake the associated thread, which in turn can request the runtime PM to reenable the device. Using synchronize_irq() leads to an immediate deadlock: the irq thread waits for the PM lock and synchronize_irq() waits for the irq thread to complete. Since it is sufficient for this case to ensure that no hard irq handler is executing, a new function which avoids the check for the thread is required. Add a function which just monitors the hard irq parts and ignores the threaded handlers. Signed-off-by: Thomas Gleixner Tested-by: Russell King Cc: Chris Ball Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140215003823.653236081@linutronix.de --- include/linux/hardirq.h | 1 + kernel/irq/manage.c | 70 +++++++++++++++++++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 12d5f972f23f..cba442ec3c66 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -9,6 +9,7 @@ extern void synchronize_irq(unsigned int irq); +extern void synchronize_hardirq(unsigned int irq); #if defined(CONFIG_TINY_RCU) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b17..274ba9238fb7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -32,24 +32,10 @@ static int __init setup_forced_irqthreads(char *arg) early_param("threadirqs", setup_forced_irqthreads); #endif -/** - * synchronize_irq - wait for pending IRQ handlers (on other CPUs) - * @irq: interrupt number to wait for - * - * This function waits for any pending IRQ handlers for this interrupt - * to complete before returning. If you use this function while - * holding a resource the IRQ handler may need you will deadlock. - * - * This function may be called - with care - from IRQ context. - */ -void synchronize_irq(unsigned int irq) +static void __synchronize_hardirq(struct irq_desc *desc) { - struct irq_desc *desc = irq_to_desc(irq); bool inprogress; - if (!desc) - return; - do { unsigned long flags; @@ -67,12 +53,56 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (inprogress); +} - /* - * We made sure that no hardirq handler is running. Now verify - * that no threaded handlers are active. - */ - wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); +/** + * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending hard IRQ handlers for this + * interrupt to complete before returning. If you use this + * function while holding a resource the IRQ handler may need you + * will deadlock. It does not take associated threaded handlers + * into account. + * + * Do not use this for shutdown scenarios where you must be sure + * that all parts (hardirq and threaded handler) have completed. + * + * This function may be called - with care - from IRQ context.
+ */ +void synchronize_hardirq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) + __synchronize_hardirq(desc); +} +EXPORT_SYMBOL(synchronize_hardirq); + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + __synchronize_hardirq(desc); + /* + * We made sure that no hardirq handler is + * running. Now verify that no threaded handlers are + * active. + */ + wait_event(desc->wait_for_threads, + !atomic_read(&desc->threads_active)); + } } EXPORT_SYMBOL(synchronize_irq); -- cgit v1.3-14-g43fede From a92444c6b2225a9115d661c950cb48a22aeace20 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 15 Feb 2014 00:55:19 +0000 Subject: genirq: Provide irq_wake_thread() In the course of the sdhci/sdio discussion with Russell about killing the sdio kthread hackery we discovered the need to be able to wake an interrupt thread from software. The rationale for this is that sdio hardware can lack proper interrupt support for certain features. So the driver needs to poll the status registers, but at the same time it needs to be woken up by a hardware interrupt. To be able to get rid of the home-brewed kthread construct of sdio we need a way to wake an irq thread independent of an actual hardware interrupt. Provide an irq_wake_thread() function which wakes up the thread which is associated with a given dev_id. This allows sdio to invoke the irq thread from the hardware irq handler via the IRQ_WAKE_THREAD return value and provides a possibility to wake it via a timer for the polling scenarios. That allows the sdio logic to be simplified significantly. Signed-off-by: Thomas Gleixner Cc: Russell King Cc: Chris Ball Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140215003823.772565780@linutronix.de --- include/linux/interrupt.h | 1 + kernel/irq/handle.c | 4 ++-- kernel/irq/internals.h | 1 + kernel/irq/manage.c | 27 +++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a2678d35b5a2..c7bfac1c4a7b 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -188,6 +188,7 @@ extern void disable_irq(unsigned int irq); extern void disable_percpu_irq(unsigned int irq); extern void enable_irq(unsigned int irq); extern void enable_percpu_irq(unsigned int irq, unsigned int type); +extern void irq_wake_thread(unsigned int irq, void *dev_id); /* The following three functions are for the core kernel use only.
*/ extern void suspend_device_irqs(void); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 131ca176b497..bfec453557b4 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -51,7 +51,7 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action) { /* * In case the thread crashed and was killed we just pretend that @@ -157,7 +157,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) break; } - irq_wake_thread(desc, action); + __irq_wake_thread(desc, action); /* Fall through to add to randomness */ case IRQ_HANDLED: diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 001fa5bab490..d61ac29e32d0 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -82,6 +82,7 @@ irqreturn_t handle_irq_event(struct irq_desc *desc); /* Resending of interrupts :*/ void check_irq_resend(struct irq_desc *desc, unsigned int irq); bool irq_wait_for_poll(struct irq_desc *desc); +void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 274ba9238fb7..54eb5c99351b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -911,6 +911,33 @@ static int irq_thread(void *data) return 0; } +/** + * irq_wake_thread - wake the irq thread for the action identified by dev_id + * @irq: Interrupt line + * @dev_id: Device identity for which the thread should be woken + * + */ +void irq_wake_thread(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + unsigned long flags; + + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return; + + raw_spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action; action; action = action->next) { + if (action->dev_id == dev_id) { + if (action->thread) + __irq_wake_thread(desc, action); + break; + } + } + raw_spin_unlock_irqrestore(&desc->lock, flags); +} +EXPORT_SYMBOL_GPL(irq_wake_thread); + static void irq_setup_forced_threading(struct irqaction *new) { if (!force_irqthreads) -- cgit v1.3-14-g43fede From b04c644e670f79417f1728e6be310cfd8e6a921b Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 10 Feb 2014 16:13:57 +0800 Subject: genirq: Fix a comment typo Change the comment "chasnge" to "change". Signed-off-by: Chuansheng Liu Link: http://lkml.kernel.org/r/1392020037-5484-2-git-send-email-chuansheng.liu@intel.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 54eb5c99351b..ada0c548c36a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -757,7 +757,7 @@ out_unlock: #ifdef CONFIG_SMP /* - * Check whether we need to chasnge the affinity of the interrupt thread. + * Check whether we need to change the affinity of the interrupt thread.
*/ static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) -- cgit v1.3-14-g43fede From 8c1a49aedb73fb2f15aaa32ad9e2e1c4289f45cb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 11:13:54 -0500 Subject: tracing: Pass trace_array to set_flag callback As options (flags) may affect instances instead of being global the set_flag() callbacks need to receive the trace_array descriptor of the instance they will be modifying. Signed-off-by: Steven Rostedt --- kernel/trace/blktrace.c | 3 ++- kernel/trace/trace.c | 18 ++++++++++-------- kernel/trace/trace.h | 3 ++- kernel/trace/trace_functions.c | 3 ++- kernel/trace/trace_functions_graph.c | 3 ++- kernel/trace/trace_irqsoff.c | 6 ++++-- kernel/trace/trace_nop.c | 2 +- kernel/trace/trace_sched_wakeup.c | 6 ++++-- 8 files changed, 27 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b418cb0d7242..0d758ca61933 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1427,7 +1427,8 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } -static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +static int +blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* don't output context-info for blk_classic output */ if (bit == TRACE_BLK_OPT_CLASSIC) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 815c878f409b..d7dfc7efc4bf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -73,7 +73,8 @@ static struct tracer_flags dummy_tracer_flags = { .opts = dummy_tracer_opt }; -static int dummy_set_flag(u32 old_flags, u32 bit, int set) +static int +dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return 0; } @@ -3339,13 +3340,14 @@ static int tracing_trace_options_show(struct seq_file *m, void *v) return 0; } -static int __set_tracer_option(struct tracer *trace, +static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { + struct tracer *trace = tr->current_trace; int ret; - ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); if (ret) return ret; @@ -3357,8 +3359,9 @@ static int __set_tracer_option(struct tracer *trace, } /* Try to assign a tracer specific option */ -static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +static int set_tracer_option(struct trace_array *tr, char *cmp, int neg) { + struct tracer *trace = tr->current_trace; struct tracer_flags *tracer_flags = trace->flags; struct tracer_opt *opts = NULL; int i; @@ -3367,8 +3370,7 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) opts = &tracer_flags->opts[i]; if (strcmp(cmp, opts->name) == 0) - return __set_tracer_option(trace, trace->flags, - opts, neg); + return __set_tracer_option(tr, trace->flags, opts, neg); } return -EINVAL; @@ -3440,7 +3442,7 @@ static int trace_set_options(struct trace_array *tr, char *option) /* If no option could be set, test the specific tracer options */ if (!trace_options[i]) - ret = set_tracer_option(tr->current_trace, cmp, neg); + ret = set_tracer_option(tr, cmp, neg); mutex_unlock(&trace_types_lock); @@ -5689,7 +5691,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - ret = 
__set_tracer_option(topt->tr->current_trace, topt->flags, + ret = __set_tracer_option(topt->tr, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 02b592f2d4b7..649a23d421c1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -355,7 +355,8 @@ struct tracer { void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ - int (*set_flag)(u32 old_flags, u32 bit, int set); + int (*set_flag)(struct trace_array *tr, + u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ int (*flag_changed)(struct tracer *tracer, u32 mask, int set); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 38fe1483c508..85e517e84f50 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -175,7 +175,8 @@ static void tracing_stop_function_trace(void) unregister_ftrace_function(&trace_ops); } -static int func_set_flag(u32 old_flags, u32 bit, int set) +static int +func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { switch (bit) { case TRACE_FUNC_OPT_STACK: diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0b99120d395c..deff11200261 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1476,7 +1476,8 @@ void graph_trace_close(struct trace_iterator *iter) } } -static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +static int +func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (bit == TRACE_GRAPH_PRINT_IRQS) ftrace_graph_skip_irqs = !set; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 2aefbee93a6d..fd99b0c183ac 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -160,7 +160,8 @@ static struct ftrace_ops trace_ops __read_mostly = #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { int cpu; @@ -266,7 +267,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) +static int +irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 394f94417e2f..f3984098c0d7 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -62,7 +62,7 @@ static void nop_trace_reset(struct trace_array *tr) * If you don't implement it, then the flag setting will be * automatically accepted. */ -static int nop_set_flag(u32 old_flags, u32 bit, int set) +static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { /* * Note that you don't need to update nop_flags.val yourself. 
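[ed: To make the new callback shape concrete, here is a hypothetical tracer's set_flag() under the reworked signature; example_set_flag, EXAMPLE_OPT_VERBOSE and example_update_verbosity are illustrative names, not kernel symbols:

	static int
	example_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
	{
		if (bit == EXAMPLE_OPT_VERBOSE) {
			/* scope the option change to this instance via @tr */
			example_update_verbosity(tr, set);
			return 0;	/* handled */
		}
		return -EINVAL;		/* unknown option bit */
	}

The added trace_array argument is what lets an option act on a single instance instead of globally.]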
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 6e32635e5e57..f0bbdc261028 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -209,7 +209,8 @@ static void stop_func_tracer(int graph) } #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { if (!(bit & TRACE_DISPLAY_GRAPH)) @@ -311,7 +312,8 @@ __trace_function(struct trace_array *tr, #else #define __trace_function trace_function -static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +static int +wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { return -EINVAL; } -- cgit v1.3-14-g43fede From bf6065b5c7014ab30383405718c7a6b96d2cbdb2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 17:51:01 -0500 Subject: tracing: Pass trace_array to flag_changed callback As options (flags) may affect instances instead of being global the flag_changed() callbacks need to receive the trace_array descriptor of the instance they will be modifying. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_irqsoff.c | 4 +++- kernel/trace/trace_sched_wakeup.c | 4 +++- 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d7dfc7efc4bf..ee8da93e91e0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3393,7 +3393,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) /* Give the tracer a chance to approve the change */ if (tr->current_trace->flag_changed) - if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) + if (tr->current_trace->flag_changed(tr, mask, !!enabled)) return -EINVAL; if (enabled) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 649a23d421c1..36e44732c650 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -358,7 +358,7 @@ struct tracer { int (*set_flag)(struct trace_array *tr, u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ - int (*flag_changed)(struct tracer *tracer, + int (*flag_changed)(struct trace_array *tr, u32 mask, int set); struct tracer *next; struct tracer_flags *flags; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index fd99b0c183ac..4bf812f454e6 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -572,8 +572,10 @@ static void irqsoff_function_set(int set) unregister_irqsoff_function(is_graph()); } -static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set) { + struct tracer *tracer = tr->current_trace; + if (mask & TRACE_ITER_FUNCTION) irqsoff_function_set(set); diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f0bbdc261028..e14da5e97a69 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -179,8 +179,10 @@ static void wakeup_function_set(int set) unregister_wakeup_function(is_graph()); } -static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set) { + struct tracer *tracer = tr->current_trace; + if (mask & TRACE_ITER_FUNCTION) wakeup_function_set(set); -- cgit v1.3-14-g43fede From 
607e2ea167e56db84387f3ab97e59a862e101cab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Nov 2013 22:42:48 -0500 Subject: tracing: Set up infrastructure to allow tracers for instances Currently the tracers (function, function_graph, irqsoff, etc) can only be used by the top level tracing directory (not for instances). This sets up the infrastructure to allow instances to run a separate tracer apart from what the top level tracing is doing. As tracers need to adapt for being used by instances, the tracers must flag if they can be used by instances or not. Currently only the 'nop' tracer can be used by all instances. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 72 ++++++++++++++++++++++++++++++++++++++---------- kernel/trace/trace.h | 1 + kernel/trace/trace_nop.c | 3 +- 3 files changed, 60 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ee8da93e91e0..944cd021aabf 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3122,27 +3122,52 @@ static int tracing_open(struct inode *inode, struct file *file) return ret; } +/* + * Some tracers are not suitable for instance buffers. + * A tracer is always available for the global array (toplevel) + * or if it explicitly states that it is.
+ */ +static bool +trace_ok_for_array(struct tracer *t, struct trace_array *tr) +{ + return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances; +} + +/* Find the next tracer that this trace array may use */ +static struct tracer * +get_tracer_for_array(struct trace_array *tr, struct tracer *t) +{ + while (t && !trace_ok_for_array(t, tr)) + t = t->next; + + return t; +} + static void * t_next(struct seq_file *m, void *v, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t = v; (*pos)++; if (t) - t = t->next; + t = get_tracer_for_array(tr, t->next); return t; } static void *t_start(struct seq_file *m, loff_t *pos) { + struct trace_array *tr = m->private; struct tracer *t; loff_t l = 0; mutex_lock(&trace_types_lock); - for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) - ; + + t = get_tracer_for_array(tr, trace_types); + for (; t && l < *pos; t = t_next(m, t, &l)) + ; return t; } @@ -3177,10 +3202,21 @@ static const struct seq_operations show_traces_seq_ops = { static int show_traces_open(struct inode *inode, struct file *file) { + struct trace_array *tr = inode->i_private; + struct seq_file *m; + int ret; + if (tracing_disabled) return -ENODEV; - return seq_open(file, &show_traces_seq_ops); + ret = seq_open(file, &show_traces_seq_ops); + if (ret) + return ret; + + m = file->private_data; + m->private = tr; + + return 0; } static ssize_t @@ -3871,10 +3907,9 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); -static int tracing_set_tracer(const char *buf) +static int tracing_set_tracer(struct trace_array *tr, const char *buf) { static struct trace_option_dentry *topts; - struct trace_array *tr = &global_trace; struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; @@ -3902,6 +3937,12 @@ static int tracing_set_tracer(const char *buf) if (t == tr->current_trace) goto out; + /* Some tracers are only allowed for the top level buffer */ + if (!trace_ok_for_array(t, tr)) { + ret = -EINVAL; + goto out; + } + trace_branch_disable(); tr->current_trace->enabled = false; @@ -3958,6 +3999,7 @@ static ssize_t tracing_set_trace_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+1]; int i; size_t ret; @@ -3977,7 +4019,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) buf[i] = 0; - err = tracing_set_tracer(buf); + err = tracing_set_tracer(tr, buf); if (err) return err; @@ -6193,6 +6235,12 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; + trace_create_file("available_tracers", 0444, d_tracer, + tr, &show_traces_fops); + + trace_create_file("current_tracer", 0644, d_tracer, + tr, &set_tracer_fops); + trace_create_file("tracing_cpumask", 0644, d_tracer, tr, &tracing_cpumask_fops); @@ -6245,12 +6293,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); - trace_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - - trace_create_file("current_tracer", 0644, d_tracer, - &global_trace, &set_tracer_fops); - #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 36e44732c650..ea51bb2004d2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -364,6 
+364,7 @@ struct tracer { struct tracer_flags *flags; bool print_max; bool enabled; + bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index f3984098c0d7..69a5cc94c01a 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -96,6 +96,7 @@ struct tracer nop_trace __read_mostly = .selftest = trace_selftest_startup_nop, #endif .flags = &nop_flags, - .set_flag = nop_set_flag + .set_flag = nop_set_flag, + .allow_instances = true, }; -- cgit v1.3-14-g43fede From f1b21c9a40704dfdf7b8423c7d2969ea31c9857d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 12:33:33 -0500 Subject: tracing: Only let top level have option files Currently, only the top level instance can have tracing options. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 944cd021aabf..da9543cdbe7a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3968,9 +3968,11 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - destroy_trace_option_files(topts); - - topts = create_trace_option_files(tr, t); + /* Currently, only the top instance has options */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); + } #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { -- cgit v1.3-14-g43fede From e6435e96ec6f31a05690876a19e63e451f7b37e2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 14:31:31 -0500 Subject: ftrace: Copy ops private to global_ops private If global_ops function is being called directly, instead of the global_ops list function, set the global_ops private to be the same as the ops private that's being called directly. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cd7f76d1eb86..98ae4ed965db 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -244,7 +244,11 @@ static void control_ops_free(struct ftrace_ops *ops) static void update_global_ops(void) { - ftrace_func_t func; + ftrace_func_t func = ftrace_global_list_func; + void *private = NULL; + + /* The list has its own recursion protection. */ + global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; /* * If there's only one function registered, then call that @@ -254,23 +258,17 @@ static void update_global_ops(void) if (ftrace_global_list == &ftrace_list_end || ftrace_global_list->next == &ftrace_list_end) { func = ftrace_global_list->func; + private = ftrace_global_list->private; /* * As we are calling the function directly. * If it does not have recursion protection, * the function_trace_op needs to be updated * accordingly. */ - if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; - else + if (!(ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)) global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE; - } else { - func = ftrace_global_list_func; - /* The list has its own recursion protection. 
*/ - global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE; } - /* If we filter on pids, update to use the pid function */ if (!list_empty(&ftrace_pids)) { set_ftrace_pid_function(func); @@ -278,6 +276,7 @@ static void update_global_ops(void) } global_ops.func = func; + global_ops.private = private; } static void ftrace_sync(struct work_struct *work) -- cgit v1.3-14-g43fede From 6b450d2533e2c1c71fbe7f1bdce0bb1c9f813030 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 08:43:01 -0500 Subject: tracing: Disable tracers before deletion of instance When an instance is about to be deleted, make sure the tracer is set to nop. If it isn't, reset the tracer and set it to the nop tracer; otherwise memory leaks and bad pointers may result. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index da9543cdbe7a..7d5913bb46e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3907,6 +3907,23 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); +/* + * Used to clear out the tracer before deletion of an instance. + * Must have trace_types_lock held. + */ +static void tracing_set_nop(struct trace_array *tr) +{ + if (tr->current_trace == &nop_trace) + return; + + tr->current_trace->enabled = false; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + tr->current_trace = &nop_trace; +} + static int tracing_set_tracer(struct trace_array *tr, const char *buf) { static struct trace_option_dentry *topts; @@ -6142,6 +6159,7 @@ static int instance_delete(const char *name) list_del(&tr->list); + tracing_set_nop(tr); event_trace_del_tracer(tr); debugfs_remove_recursive(tr->dir); free_percpu(tr->trace_buffer.data); ring_buffer_free(tr->trace_buffer.buffer); -- cgit v1.3-14-g43fede From 50512ab576e1ce29953c9259e1f36ce16f350f20 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 14 Jan 2014 08:52:35 -0500 Subject: tracing: Convert tracer->enabled to counter As tracers will soon be used by instances, the tracer enabled field needs to be converted to a counter instead of a boolean. This counter is protected by the trace_types_lock mutex.
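To make the counting semantics concrete, a minimal sketch (illustrative only, not the patch itself; tr1 and tr2 are hypothetical instances, and every update is assumed to run under trace_types_lock):

    /*
     * Two trace_array instances sharing one struct tracer. With a bool,
     * the first instance to stop would mark the tracer disabled for
     * both; a counter keeps it live until the last user drops it.
     */
    tr1->current_trace = t;
    t->enabled++;                   /* first user */

    tr2->current_trace = t;
    t->enabled++;                   /* second user, same tracer */

    tracing_set_nop(tr1);           /* decrements t->enabled */
    /* t->enabled == 1: tracer still live for tr2 */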
Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- kernel/trace/trace.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d5913bb46e8..f9f22c435036 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3916,7 +3916,7 @@ static void tracing_set_nop(struct trace_array *tr) if (tr->current_trace == &nop_trace) return; - tr->current_trace->enabled = false; + tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); @@ -3962,7 +3962,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) trace_branch_disable(); - tr->current_trace->enabled = false; + tr->current_trace->enabled--; if (tr->current_trace->reset) tr->current_trace->reset(tr); @@ -4006,7 +4006,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) } tr->current_trace = t; - tr->current_trace->enabled = true; + tr->current_trace->enabled++; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ea51bb2004d2..86915b220bbe 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -362,8 +362,8 @@ struct tracer { u32 mask, int set); struct tracer *next; struct tracer_flags *flags; + int enabled; bool print_max; - bool enabled; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; -- cgit v1.3-14-g43fede From f20a580627f43e73e4e57cb37e3864080ca06088 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 7 Nov 2013 20:08:58 -0500 Subject: ftrace: Allow instances to use function tracing Allow instances (sub-buffers) to enable function tracing. Each instance will have its own function tracing capability. For now, instances will not have function stack tracing, nor will they be able to pick and choose what functions they can trace. Picking and choosing their own functions will come later.
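As a rough sketch of where this is headed (field names follow the diff further below; instance_func_call is a made-up name): each instance gets its own ftrace_ops whose ->private points back at the instance's trace_array, so the callback needs no global pointer.

    static void instance_func_call(unsigned long ip, unsigned long parent_ip,
                                   struct ftrace_ops *op, struct pt_regs *regs)
    {
            struct trace_array *tr = op->private;   /* this instance's buffer */

            if (unlikely(!tr->function_enabled))
                    return;
            /* ... record ip/parent_ip into tr->trace_buffer ... */
    }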
Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 ++ kernel/trace/trace_functions.c | 116 +++++++++++++++++++++++++++-------------- 2 files changed, 81 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86915b220bbe..35cca055da0f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -210,6 +210,11 @@ struct trace_array { struct list_head events; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops *ops; + /* function tracing enabled */ + int function_enabled; +#endif }; enum { diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 85e517e84f50..3f8dc1ce8b9c 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -13,33 +13,83 @@ #include #include #include +#include #include #include "trace.h" -/* function tracing enabled */ -static int ftrace_function_enabled; +static void tracing_start_function_trace(struct trace_array *tr); +static void tracing_stop_function_trace(struct trace_array *tr); +static void +function_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct pt_regs *pt_regs); +static struct ftrace_ops trace_ops; +static struct ftrace_ops trace_stack_ops; +static struct tracer_flags func_flags; + +/* Our option */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static int allocate_ftrace_ops(struct trace_array *tr) +{ + struct ftrace_ops *ops; -static struct trace_array *func_trace; + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; -static void tracing_start_function_trace(void); -static void tracing_stop_function_trace(void); + /* Currently only the non stack verision is supported */ + ops->func = function_trace_call; + ops->flags = FTRACE_OPS_FL_RECURSION_SAFE; + + tr->ops = ops; + ops->private = tr; + return 0; +} static int function_trace_init(struct trace_array *tr) { - func_trace = tr; + struct ftrace_ops *ops; + int ret; + + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + /* There's only one global tr */ + if (!trace_ops.private) { + trace_ops.private = tr; + trace_stack_ops.private = tr; + } + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + ops = &trace_stack_ops; + else + ops = &trace_ops; + tr->ops = ops; + } else { + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; + } + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); - tracing_start_function_trace(); + tracing_start_function_trace(tr); return 0; } static void function_trace_reset(struct trace_array *tr) { - tracing_stop_function_trace(); + tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + kfree(tr->ops); + tr->ops = NULL; } static void function_trace_start(struct trace_array *tr) @@ -47,25 +97,18 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->trace_buffer); } -/* Our option */ -enum { - TRACE_FUNC_OPT_STACK = 0x1, -}; - -static struct tracer_flags func_flags; - static void function_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; int bit; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if 
(unlikely(!tr->function_enabled)) return; pc = preempt_count(); @@ -91,14 +134,14 @@ static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct trace_array *tr = func_trace; + struct trace_array *tr = op->private; struct trace_array_cpu *data; unsigned long flags; long disabled; int cpu; int pc; - if (unlikely(!ftrace_function_enabled)) + if (unlikely(!tr->function_enabled)) return; /* @@ -128,7 +171,6 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, local_irq_restore(flags); } - static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -153,26 +195,17 @@ static struct tracer_flags func_flags = { .opts = func_opts }; -static void tracing_start_function_trace(void) +static void tracing_start_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - register_ftrace_function(&trace_stack_ops); - else - register_ftrace_function(&trace_ops); - - ftrace_function_enabled = 1; + tr->function_enabled = 0; + register_ftrace_function(tr->ops); + tr->function_enabled = 1; } -static void tracing_stop_function_trace(void) +static void tracing_stop_function_trace(struct trace_array *tr) { - ftrace_function_enabled = 0; - - if (func_flags.val & TRACE_FUNC_OPT_STACK) - unregister_ftrace_function(&trace_stack_ops); - else - unregister_ftrace_function(&trace_ops); + tr->function_enabled = 0; + unregister_ftrace_function(tr->ops); } static int @@ -184,12 +217,14 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + unregister_ftrace_function(tr->ops); + if (set) { - unregister_ftrace_function(&trace_ops); - register_ftrace_function(&trace_stack_ops); + tr->ops = &trace_stack_ops; + register_ftrace_function(tr->ops); } else { - unregister_ftrace_function(&trace_stack_ops); - register_ftrace_function(&trace_ops); + tr->ops = &trace_ops; + register_ftrace_function(tr->ops); } break; @@ -209,6 +244,7 @@ static struct tracer function_trace __tracer_data = .wait_pipe = poll_wait_pipe, .flags = &func_flags, .set_flag = func_set_flag, + .allow_instances = true, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function, #endif -- cgit v1.3-14-g43fede From e3b3e2e847080e3cc14bee778c6ced3d59bfd76c Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Mon, 11 Nov 2013 23:07:14 -0500 Subject: ftrace: Pass in global_ops for use with filtering files In preparation for having the function tracing instances be able to filter on functions, the generic filter functions must first be converted to take in the global_ops as a parameter. 
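The mechanism is the usual debugfs i_private round trip; a sketch assuming the names in the diff below (example_filter_open is a hypothetical stand-in for the open handlers being converted):

    /* trace_create_file() stashes its data argument in inode->i_private */
    trace_create_file("set_ftrace_filter", 0644, d_tracer,
                      &global_ops, &ftrace_filter_fops);

    static int example_filter_open(struct inode *inode, struct file *file)
    {
            /* recover which ftrace_ops this file controls */
            struct ftrace_ops *ops = inode->i_private;

            return ftrace_regex_open(ops,
                                     FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
                                     inode, file);
    }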
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98ae4ed965db..2b3e23991c8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2870,7 +2870,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, static int ftrace_filter_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, inode, file); } @@ -2878,7 +2880,9 @@ ftrace_filter_open(struct inode *inode, struct file *file) static int ftrace_notrace_open(struct inode *inode, struct file *file) { - return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, + struct ftrace_ops *ops = inode->i_private; + + return ftrace_regex_open(ops, FTRACE_ITER_NOTRACE, inode, file); } @@ -4118,10 +4122,10 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) d_tracer, NULL, &ftrace_enabled_fops); trace_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); + &global_ops, &ftrace_filter_fops); trace_create_file("set_ftrace_notrace", 0644, d_tracer, - NULL, &ftrace_notrace_fops); + &global_ops, &ftrace_notrace_fops); #ifdef CONFIG_FUNCTION_GRAPH_TRACER trace_create_file("set_graph_function", 0444, d_tracer, -- cgit v1.3-14-g43fede From 591dffdade9f07692a7dd3ed16830ec24e901ece Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 10 Jan 2014 16:17:45 -0500 Subject: ftrace: Allow for function tracing instance to filter functions Create "set_ftrace_filter" and "set_ftrace_notrace" files in the instance directories to let users filter the functions to trace for the given instance. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 39 ++++++++++++++++++++++++++++++----- kernel/trace/trace.c | 4 ++++ kernel/trace/trace.h | 25 ++++++++++++++++++++++++- kernel/trace/trace_functions.c | 40 ++++++++++++++++++++++++++++++-------- 5 files changed, 96 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ef1607ed7044..e6141be2fad5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -92,6 +92,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * STUB - The ftrace_ops is just a place holder. * INITIALIZED - The ftrace_ops has already been initialized (first use time * register_ftrace_function() is called, it will initialized the ops) + * DELETED - The ops are being deleted, do not let them be registered again.
*/ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -103,6 +104,7 @@ enum { FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, FTRACE_OPS_FL_STUB = 1 << 7, FTRACE_OPS_FL_INITIALIZED = 1 << 8, + FTRACE_OPS_FL_DELETED = 1 << 9, }; /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2b3e23991c8a..dcee546f21bc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -436,6 +436,9 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list, static int __register_ftrace_function(struct ftrace_ops *ops) { + if (ops->flags & FTRACE_OPS_FL_DELETED) + return -EINVAL; + if (FTRACE_WARN_ON(ops == &global_ops)) return -EINVAL; @@ -4112,6 +4115,36 @@ static const struct file_operations ftrace_graph_notrace_fops = { }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent) +{ + + trace_create_file("set_ftrace_filter", 0644, parent, + ops, &ftrace_filter_fops); + + trace_create_file("set_ftrace_notrace", 0644, parent, + ops, &ftrace_notrace_fops); +} + +/* + * The name "destroy_filter_files" is really a misnomer. Although + * in the future, it may actualy delete the files, but this is + * really intended to make sure the ops passed in are disabled + * and that when this function returns, the caller is free to + * free the ops. + * + * The "destroy" name is only to match the "create" name that this + * should be paired with. + */ +void ftrace_destroy_filter_files(struct ftrace_ops *ops) +{ + mutex_lock(&ftrace_lock); + if (ops->flags & FTRACE_OPS_FL_ENABLED) + ftrace_shutdown(ops, 0); + ops->flags |= FTRACE_OPS_FL_DELETED; + mutex_unlock(&ftrace_lock); +} + static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { @@ -4121,11 +4154,7 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) trace_create_file("enabled_functions", 0444, d_tracer, NULL, &ftrace_enabled_fops); - trace_create_file("set_ftrace_filter", 0644, d_tracer, - &global_ops, &ftrace_filter_fops); - - trace_create_file("set_ftrace_notrace", 0644, d_tracer, - &global_ops, &ftrace_notrace_fops); + ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER trace_create_file("set_graph_function", 0444, d_tracer, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f9f22c435036..d95ec2876bbb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6161,6 +6161,7 @@ static int instance_delete(const char *name) tracing_set_nop(tr); event_trace_del_tracer(tr); + ftrace_destroy_function_files(tr); debugfs_remove_recursive(tr->dir); free_percpu(tr->trace_buffer.data); ring_buffer_free(tr->trace_buffer.buffer); @@ -6291,6 +6292,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + if (ftrace_create_function_files(tr, d_tracer)) + WARN(1, "Could not allocate function filter files"); + #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, tr, &snapshot_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 35cca055da0f..ffc314b7e92b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -819,13 +819,36 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } extern int ftrace_is_dead(void); +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent); +void ftrace_destroy_function_files(struct trace_array *tr); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } static inline 
int ftrace_is_dead(void) { return 0; } -#endif +static inline int +ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + return 0; +} +static inline void ftrace_destroy_function_files(struct trace_array *tr) { } +#endif /* CONFIG_FUNCTION_TRACER */ + +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +void ftrace_create_filter_files(struct ftrace_ops *ops, + struct dentry *parent); +void ftrace_destroy_filter_files(struct ftrace_ops *ops); +#else +/* + * The ops parameter passed in is usually undefined. + * This must be a macro. + */ +#define ftrace_create_filter_files(ops, parent) do { } while (0) +#define ftrace_destroy_filter_files(ops) do { } while (0) +#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ int ftrace_event_is_function(struct ftrace_event_call *call); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 3f8dc1ce8b9c..5b781d2be383 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -52,10 +52,34 @@ static int allocate_ftrace_ops(struct trace_array *tr) return 0; } + +int ftrace_create_function_files(struct trace_array *tr, + struct dentry *parent) +{ + int ret; + + /* The top level array uses the "global_ops". */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) { + ret = allocate_ftrace_ops(tr); + if (ret) + return ret; + } + + ftrace_create_filter_files(tr->ops, parent); + + return 0; +} + +void ftrace_destroy_function_files(struct trace_array *tr) +{ + ftrace_destroy_filter_files(tr->ops); + kfree(tr->ops); + tr->ops = NULL; +} + static int function_trace_init(struct trace_array *tr) { struct ftrace_ops *ops; - int ret; if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { /* There's only one global tr */ @@ -69,10 +93,13 @@ static int function_trace_init(struct trace_array *tr) else ops = &trace_ops; tr->ops = ops; - } else { - ret = allocate_ftrace_ops(tr); - if (ret) - return ret; + } else if (!tr->ops) { + /* + * Instance trace_arrays get their ops allocated + * at instance creation. Unless it failed + * the allocation. + */ + return -ENOMEM; } tr->trace_buffer.cpu = get_cpu(); @@ -87,9 +114,6 @@ static void function_trace_reset(struct trace_array *tr) { tracing_stop_function_trace(tr); tracing_stop_cmdline_record(); - if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) - kfree(tr->ops); - tr->ops = NULL; } static void function_trace_start(struct trace_array *tr) -- cgit v1.3-14-g43fede From a43b97043048eac1686f409af7ad3bb8071b9d83 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 17 Jan 2014 17:08:36 +0900 Subject: tracing/uprobes: Rename uprobe_{trace,perf}_print() functions The uprobe_{trace,perf}_print functions are misnomers since what they do is not printing. There's also a real print function named print_uprobe_event() so they'll only increase confusion IMHO. Rename them with double underscores to follow convention of kprobe. 
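The convention being adopted, sketched from the diff below: the bare name stays the handler that the dispatcher calls, and the double-underscore name marks the internal worker that actually writes the event.

    static void __uprobe_trace_func(struct trace_uprobe *tu,
                                    unsigned long func, struct pt_regs *regs);

    static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
    {
            if (!is_ret_probe(tu))
                    __uprobe_trace_func(tu, 0, regs);
            return 0;
    }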
Link: http://lkml.kernel.org/r/1389946120-19610-2-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Srikar Dronamraju Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 79e52d93860b..c5d2612bf233 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -758,7 +758,7 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) mutex_unlock(&ucb->mutex); } -static void uprobe_trace_print(struct trace_uprobe *tu, +static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; @@ -807,14 +807,14 @@ out: static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { if (!is_ret_probe(tu)) - uprobe_trace_print(tu, 0, regs); + __uprobe_trace_func(tu, 0, regs); return 0; } static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { - uprobe_trace_print(tu, func, regs); + __uprobe_trace_func(tu, func, regs); } /* Event entry printers */ @@ -1014,7 +1014,7 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, return ret; } -static void uprobe_perf_print(struct trace_uprobe *tu, +static void __uprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->tp.call; @@ -1078,14 +1078,14 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - uprobe_perf_print(tu, 0, regs); + __uprobe_perf_func(tu, 0, regs); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs) { - uprobe_perf_print(tu, func, regs); + __uprobe_perf_func(tu, func, regs); } #endif /* CONFIG_PERF_EVENTS */ -- cgit v1.3-14-g43fede From dd9fa555d7bbfcc7dbc63eb744806e9f6cb62e9f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 17 Jan 2014 17:08:37 +0900 Subject: tracing/uprobes: Move argument fetching to uprobe_dispatcher() A single uprobe event might serve different users like ftrace and perf. And this is especially important for upcoming multi-buffer support. But in this case it'll fetch the same data from userspace multiple times. So move it to the beginning of the dispatcher function and reuse it for each user.
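Condensed from the diff below, the resulting dispatcher shape (a sketch; the filter checks and error handling are elided):

    /* fetch the probe arguments from userspace exactly once ... */
    dsize = __get_data_size(&tu->tp, regs);
    esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));

    ucb = uprobe_buffer_get();
    store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize);

    /* ... then hand the same buffer to every consumer */
    if (tu->tp.flags & TP_FLAG_TRACE)
            ret |= uprobe_trace_func(tu, regs, ucb, dsize);
    if (tu->tp.flags & TP_FLAG_PROFILE)
            ret |= uprobe_perf_func(tu, regs, ucb, dsize);

    uprobe_buffer_put(ucb);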
Link: http://lkml.kernel.org/r/1389946120-19610-3-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Srikar Dronamraju Cc: zhangwei(Jovi) Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 93 +++++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c5d2612bf233..d83155e0da78 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -759,30 +759,25 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) } static void __uprobe_trace_func(struct trace_uprobe *tu, - unsigned long func, struct pt_regs *regs) + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - struct uprobe_cpu_buffer *ucb; void *data; - int size, dsize, esize; + int size, esize; struct ftrace_event_call *call = &tu->tp.call; - dsize = __get_data_size(&tu->tp, regs); - esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - - if (WARN_ON_ONCE(!uprobe_cpu_buffer || tu->tp.size + dsize > PAGE_SIZE)) + if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) return; - ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); - + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, 0, 0); if (!event) - goto out; + return; entry = ring_buffer_event_data(event); if (is_ret_probe(tu)) { @@ -798,23 +793,22 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, if (!call_filter_check_discard(call, entry, buffer, event)) trace_buffer_unlock_commit(buffer, event, 0, 0); - -out: - uprobe_buffer_put(ucb); } /* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { if (!is_ret_probe(tu)) - __uprobe_trace_func(tu, 0, regs); + __uprobe_trace_func(tu, 0, regs, ucb, dsize); return 0; } static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - __uprobe_trace_func(tu, func, regs); + __uprobe_trace_func(tu, func, regs, ucb, dsize); } /* Event entry printers */ @@ -1015,30 +1009,23 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, } static void __uprobe_perf_func(struct trace_uprobe *tu, - unsigned long func, struct pt_regs *regs) + unsigned long func, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { struct ftrace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; - struct uprobe_cpu_buffer *ucb; void *data; - int size, dsize, esize; + int size, esize; int rctx; - dsize = __get_data_size(&tu->tp, regs); esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); - if (WARN_ON_ONCE(!uprobe_cpu_buffer)) - return; - size = esize + tu->tp.size + dsize; size = ALIGN(size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - ucb = uprobe_buffer_get(); - store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); - preempt_disable(); head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) @@ -1068,24 
+1055,25 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); - uprobe_buffer_put(ucb); } /* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; if (!is_ret_probe(tu)) - __uprobe_perf_func(tu, 0, regs); + __uprobe_perf_func(tu, 0, regs, ucb, dsize); return 0; } static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, - struct pt_regs *regs) + struct pt_regs *regs, + struct uprobe_cpu_buffer *ucb, int dsize) { - __uprobe_perf_func(tu, func, regs); + __uprobe_perf_func(tu, func, regs, ucb, dsize); } #endif /* CONFIG_PERF_EVENTS */ @@ -1127,8 +1115,11 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; int ret = 0; + tu = container_of(con, struct trace_uprobe, consumer); tu->nhit++; @@ -1137,13 +1128,29 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) current->utask->vaddr = (unsigned long) &udd; +#ifdef CONFIG_PERF_EVENTS + if ((tu->tp.flags & TP_FLAG_TRACE) == 0 && + !uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; +#endif + + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + if (tu->tp.flags & TP_FLAG_TRACE) - ret |= uprobe_trace_func(tu, regs); + ret |= uprobe_trace_func(tu, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS if (tu->tp.flags & TP_FLAG_PROFILE) - ret |= uprobe_perf_func(tu, regs); + ret |= uprobe_perf_func(tu, regs, ucb, dsize); #endif + uprobe_buffer_put(ucb); return ret; } @@ -1152,6 +1159,8 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, { struct trace_uprobe *tu; struct uprobe_dispatch_data udd; + struct uprobe_cpu_buffer *ucb; + int dsize, esize; tu = container_of(con, struct trace_uprobe, consumer); @@ -1160,13 +1169,23 @@ static int uretprobe_dispatcher(struct uprobe_consumer *con, current->utask->vaddr = (unsigned long) &udd; + if (WARN_ON_ONCE(!uprobe_cpu_buffer)) + return 0; + + dsize = __get_data_size(&tu->tp, regs); + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); + + ucb = uprobe_buffer_get(); + store_trace_args(esize, &tu->tp, regs, ucb->buf, dsize); + if (tu->tp.flags & TP_FLAG_TRACE) - uretprobe_trace_func(tu, func, regs); + uretprobe_trace_func(tu, func, regs, ucb, dsize); #ifdef CONFIG_PERF_EVENTS if (tu->tp.flags & TP_FLAG_PROFILE) - uretprobe_perf_func(tu, func, regs); + uretprobe_perf_func(tu, func, regs, ucb, dsize); #endif + uprobe_buffer_put(ucb); return 0; } -- cgit v1.3-14-g43fede From 70ed91c6ec7f8bf20369634017d887d48ac979d2 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" Date: Fri, 17 Jan 2014 17:08:38 +0900 Subject: tracing/uprobes: Support ftrace_event_file base multibuffer Support multi-buffer on uprobe-based dynamic events by using ftrace_event_file. 
This patch is based on the kprobe-based dynamic events multibuffer support work initially committed by Masami (commit 41a7dd420c), but revised as below: Oleg changed the kprobe-based multibuffer design from array-pointers of ftrace_event_file into a simple list, so this patch also changes to the list design. rcu_read_lock/unlock are added to uprobe_trace_func/uretprobe_trace_func to synchronize with ftrace_event_file list addition and deletion. Even though we allow multiple uprobe instances now, TP_FLAG_PROFILE/TP_FLAG_TRACE are still mutually exclusive in probe_event_enable, which means we cannot allow one user to use the uprobe tracer while another uses perf-probe on the same uprobe concurrently. (Perhaps this will be fixed in the future; kprobes don't have this limitation now.) Link: http://lkml.kernel.org/r/1389946120-19610-4-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Reviewed-by: Oleg Nesterov Cc: Ingo Molnar Cc: Frederic Weisbecker Cc: Srikar Dronamraju Signed-off-by: zhangwei(Jovi) Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 17 ------- kernel/trace/trace_probe.h | 17 +++++++ kernel/trace/trace_uprobe.c | 105 +++++++++++++++++++++++++++++++++++--------- 3 files changed, 101 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bdbae450c13e..d021d21dd150 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -35,11 +35,6 @@ struct trace_kprobe { struct trace_probe tp; }; -struct event_file_link { - struct ftrace_event_file *file; - struct list_head list; -}; - #define SIZEOF_TRACE_KPROBE(n) \ (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) @@ -387,18 +382,6 @@ enable_trace_kprobe(struct trace_kprobe *tk, struct ftrace_event_file *file) return ret; } -static struct event_file_link * -find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) -{ - struct event_file_link *link; - - list_for_each_entry(link, &tp->files, list) - if (link->file == file) - return link; - - return NULL; -} - /* * Disable trace_probe * if the file is NULL, disable "perf" handler, or disable "trace" handler.
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index b73574a5f429..fb1ab5dfbd42 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -288,6 +288,11 @@ struct trace_probe { struct probe_arg args[]; }; +struct event_file_link { + struct ftrace_event_file *file; + struct list_head list; +}; + static inline bool trace_probe_is_enabled(struct trace_probe *tp) { return !!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE)); @@ -316,6 +321,18 @@ static inline int is_good_name(const char *name) return 1; } +static inline struct event_file_link * +find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) +{ + struct event_file_link *link; + + list_for_each_entry(link, &tp->files, list) + if (link->file == file) + return link; + + return NULL; +} + extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, struct probe_arg *parg, bool is_return, bool is_kprobe); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d83155e0da78..349c6df9e332 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -260,6 +260,7 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) goto error; INIT_LIST_HEAD(&tu->list); + INIT_LIST_HEAD(&tu->tp.files); tu->consumer.handler = uprobe_dispatcher; if (is_ret) tu->consumer.ret_handler = uretprobe_dispatcher; @@ -760,7 +761,8 @@ static void uprobe_buffer_put(struct uprobe_cpu_buffer *ucb) static void __uprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, - struct uprobe_cpu_buffer *ucb, int dsize) + struct uprobe_cpu_buffer *ucb, int dsize, + struct ftrace_event_file *ftrace_file) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -769,13 +771,15 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, int size, esize; struct ftrace_event_call *call = &tu->tp.call; + WARN_ON(call != ftrace_file->event_call); + if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) return; esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; - event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, 0, 0); + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, + call->event.type, size, 0, 0); if (!event) return; @@ -799,8 +803,16 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize) { - if (!is_ret_probe(tu)) - __uprobe_trace_func(tu, 0, regs, ucb, dsize); + struct event_file_link *link; + + if (is_ret_probe(tu)) + return 0; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, 0, regs, ucb, dsize, link->file); + rcu_read_unlock(); + return 0; } @@ -808,7 +820,12 @@ static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, struct pt_regs *regs, struct uprobe_cpu_buffer *ucb, int dsize) { - __uprobe_trace_func(tu, func, regs, ucb, dsize); + struct event_file_link *link; + + rcu_read_lock(); + list_for_each_entry_rcu(link, &tu->tp.files, list) + __uprobe_trace_func(tu, func, regs, ucb, dsize, link->file); + rcu_read_unlock(); } /* Event entry printers */ @@ -855,12 +872,31 @@ typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm); static int -probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) +probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, + filter_func_t filter) { - int ret = 
0; + bool enabled = trace_probe_is_enabled(&tu->tp); + struct event_file_link *link = NULL; + int ret; + + if (file) { + if (tu->tp.flags & TP_FLAG_PROFILE) + return -EINTR; - if (trace_probe_is_enabled(&tu->tp)) - return -EINTR; + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + + link->file = file; + list_add_tail_rcu(&link->list, &tu->tp.files); + + tu->tp.flags |= TP_FLAG_TRACE; + } else { + if (tu->tp.flags & TP_FLAG_TRACE) + return -EINTR; + + tu->tp.flags |= TP_FLAG_PROFILE; + } ret = uprobe_buffer_enable(); if (ret < 0) @@ -868,24 +904,49 @@ probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter) WARN_ON(!uprobe_filter_is_empty(&tu->filter)); - tu->tp.flags |= flag; + if (enabled) + return 0; + tu->consumer.filter = filter; ret = uprobe_register(tu->inode, tu->offset, &tu->consumer); - if (ret) - tu->tp.flags &= ~flag; + if (ret) { + if (file) { + list_del(&link->list); + kfree(link); + tu->tp.flags &= ~TP_FLAG_TRACE; + } else + tu->tp.flags &= ~TP_FLAG_PROFILE; + } return ret; } -static void probe_event_disable(struct trace_uprobe *tu, int flag) +static void +probe_event_disable(struct trace_uprobe *tu, struct ftrace_event_file *file) { if (!trace_probe_is_enabled(&tu->tp)) return; + if (file) { + struct event_file_link *link; + + link = find_event_file_link(&tu->tp, file); + if (!link) + return; + + list_del_rcu(&link->list); + /* synchronize with u{,ret}probe_trace_func */ + synchronize_sched(); + kfree(link); + + if (!list_empty(&tu->tp.files)) + return; + } + WARN_ON(!uprobe_filter_is_empty(&tu->filter)); uprobe_unregister(tu->inode, tu->offset, &tu->consumer); - tu->tp.flags &= ~flag; + tu->tp.flags &= file ? ~TP_FLAG_TRACE : ~TP_FLAG_PROFILE; uprobe_buffer_disable(); } @@ -1077,25 +1138,27 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, } #endif /* CONFIG_PERF_EVENTS */ -static -int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) +static int +trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, + void *data) { struct trace_uprobe *tu = event->data; + struct ftrace_event_file *file = data; switch (type) { case TRACE_REG_REGISTER: - return probe_event_enable(tu, TP_FLAG_TRACE, NULL); + return probe_event_enable(tu, file, NULL); case TRACE_REG_UNREGISTER: - probe_event_disable(tu, TP_FLAG_TRACE); + probe_event_disable(tu, file); return 0; #ifdef CONFIG_PERF_EVENTS case TRACE_REG_PERF_REGISTER: - return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter); + return probe_event_enable(tu, NULL, uprobe_perf_filter); case TRACE_REG_PERF_UNREGISTER: - probe_event_disable(tu, TP_FLAG_PROFILE); + probe_event_disable(tu, NULL); return 0; case TRACE_REG_PERF_OPEN: -- cgit v1.3-14-g43fede From ca3b162021a421b38a9cd7b66555b9b01568dc9d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 17 Jan 2014 17:08:39 +0900 Subject: tracing/uprobes: Support event triggering Add support for event triggering to uprobes. This is same as kprobes support added by Tom (plus cleanup by Steven). 
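The commit path after this change, sketched from the diff below: check the soft-disable flag before reserving ring-buffer space, and commit through the trigger-aware helper so attached triggers fire.

    if (ftrace_trigger_soft_disabled(ftrace_file))
            return;

    /* ... reserve the ring-buffer event and fill in the entry ... */

    event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0);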
Link: http://lkml.kernel.org/r/1389946120-19610-5-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Srikar Dronamraju Cc: zhangwei(Jovi) Cc: Tom Zanussi Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 349c6df9e332..01fcb0db75cb 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -776,6 +776,9 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, if (WARN_ON_ONCE(tu->tp.size + dsize > PAGE_SIZE)) return; + if (ftrace_trigger_soft_disabled(ftrace_file)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, @@ -795,8 +798,7 @@ static void __uprobe_trace_func(struct trace_uprobe *tu, memcpy(data, ucb->buf, tu->tp.size + dsize); - if (!call_filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); + event_trigger_unlock_commit(ftrace_file, buffer, event, entry, 0, 0); } /* uprobe handler */ -- cgit v1.3-14-g43fede From 43fe98913c9f67e3b523615ee3316f9520a623e0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 17 Jan 2014 17:08:40 +0900 Subject: tracing/uprobes: Support mix of ftrace and perf It seems there's no reason to prevent mixed used of ftrace and perf for a single uprobe event. At least the kprobes already support it. Link: http://lkml.kernel.org/r/1389946120-19610-6-git-send-email-namhyung@kernel.org Reviewed-by: Masami Hiramatsu Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Srikar Dronamraju Cc: zhangwei(Jovi) Signed-off-by: Namhyung Kim Signed-off-by: Steven Rostedt --- kernel/trace/trace_uprobe.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 01fcb0db75cb..e4473367e7a4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -882,9 +882,6 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, int ret; if (file) { - if (tu->tp.flags & TP_FLAG_PROFILE) - return -EINTR; - link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) return -ENOMEM; @@ -893,12 +890,8 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file, list_add_tail_rcu(&link->list, &tu->tp.files); tu->tp.flags |= TP_FLAG_TRACE; - } else { - if (tu->tp.flags & TP_FLAG_TRACE) - return -EINTR; - + } else tu->tp.flags |= TP_FLAG_PROFILE; - } ret = uprobe_buffer_enable(); if (ret < 0) -- cgit v1.3-14-g43fede From e1e232ca6b8faa210e5509f17d55519b4392524f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 10 Feb 2014 23:38:46 -0500 Subject: tracing: Add trace_clock= kernel parameter Being able to change the trace clock at boot can be advantageous if you need a better source of when things happen across CPUs. The default trace clock is the fastest, but it uses local clocks which may not be synced across CPUs and it does not let you know when events took place with respect to events on other CPUs. The global trace clock can help in this case, and if you do not care about timings, the counter "clock" is the best, as that is just a simple atomic counter that is incremented for every event. Usage is to add "trace_clock=counter" on the kernel command line. 
You can replace counter with "global" or any of the clocks listed in /sys/kernel/debug/tracing/trace_clock Suggested-by: Thomas Gleixner Tested-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Appreciated-by: Thomas Gleixner Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 61 ++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d95ec2876bbb..c90f55d80f86 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -181,6 +181,17 @@ static int __init set_trace_boot_options(char *str) } __setup("trace_options=", set_trace_boot_options); +static char trace_boot_clock_buf[MAX_TRACER_SIZE] __initdata; +static char *trace_boot_clock __initdata; + +static int __init set_trace_boot_clock(char *str) +{ + strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE); + trace_boot_clock = trace_boot_clock_buf; + return 0; +} +__setup("trace_clock=", set_trace_boot_clock); + unsigned long long ns2usecs(cycle_t nsec) { @@ -4746,25 +4757,10 @@ static int tracing_clock_show(struct seq_file *m, void *v) return 0; } -static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *fpos) +static int tracing_set_clock(struct trace_array *tr, const char *clockstr) { - struct seq_file *m = filp->private_data; - struct trace_array *tr = m->private; - char buf[64]; - const char *clockstr; int i; - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - clockstr = strstrip(buf); - for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { if (strcmp(trace_clocks[i].name, clockstr) == 0) break; @@ -4792,6 +4788,32 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, mutex_unlock(&trace_types_lock); + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; + char buf[64]; + const char *clockstr; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + ret = tracing_set_clock(tr, clockstr); + if (ret) + return ret; + *fpos += cnt; return cnt; @@ -6574,6 +6596,13 @@ __init static int tracer_alloc_buffers(void) trace_init_cmdlines(); + if (trace_boot_clock) { + ret = tracing_set_clock(&global_trace, trace_boot_clock); + if (ret < 0) + pr_warning("Trace clock %s not defined, going back to default\n", + trace_boot_clock); + } + /* * register_tracer() might reference current_trace, so it * needs to be set before we register anything. This is -- cgit v1.3-14-g43fede From 1fcc155351f183e5044180eeb372a8ff47710855 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 19 Feb 2014 15:12:18 -0500 Subject: ftrace: Have static function trace clear ENABLED flag on unregister The ENABLED flag needs to be cleared when a ftrace_ops is unregistered otherwise it wont be able to be registered again. This is only for static tracing and does not affect DYNAMIC_FTRACE at all. 
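An illustration of the failure mode being fixed (my_ops and my_callback are hypothetical; this applies only to builds without CONFIG_DYNAMIC_FTRACE):

    static struct ftrace_ops my_ops = {
            .func = my_callback,
    };

    register_ftrace_function(&my_ops);      /* sets FTRACE_OPS_FL_ENABLED */
    unregister_ftrace_function(&my_ops);    /* previously left ENABLED set */
    register_ftrace_function(&my_ops);      /* refused before this fix,
                                               works again after it */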
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcee546f21bc..5313c1100d30 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4463,7 +4463,13 @@ static inline void ftrace_startup_enable(int command) { } (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ ___ret; \ }) -# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops) +# define ftrace_shutdown(ops, command) \ + ({ \ + int ___ret = __unregister_ftrace_function(ops); \ + if (!___ret) \ + (ops)->flags &= ~FTRACE_OPS_FL_ENABLED; \ + ___ret; \ + }) # define ftrace_startup_sysctl() do { } while (0) # define ftrace_shutdown_sysctl() do { } while (0) -- cgit v1.3-14-g43fede From b080e047a61f7050246ff3081f87832997170d29 Mon Sep 17 00:00:00 2001 From: Brian Campbell Date: Sun, 16 Feb 2014 22:58:12 -0500 Subject: user_namespace.c: Remove duplicated word in comment Signed-off-by: Brian Campbell Signed-off-by: Linus Torvalds --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 240fb62cf394..dd06439b9c84 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test - * for and handle handle INVALID_UID being returned. INVALID_UID + * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) -- cgit v1.3-14-g43fede From a53efe5ff88d0283bae8a2c2fa066d0fff31dc91 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 26 Oct 2012 17:17:44 +0200 Subject: sched/mm: call finish_arch_post_lock_switch in idle_task_exit and use_mm The finish_arch_post_lock_switch is called at the end of the task switch after all locks have been released. In concept it is paired with the switch_mm function, but the current code only does the call in finish_task_switch. Add the call to idle_task_exit and use_mm. One use case for the additional calls is s390 which will use finish_arch_post_lock_switch to wait for the completion of TLB flush operations. 
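A sketch of the hook contract (the no-op fallback lives in the scheduler's private header; treat the exact location as an assumption): an architecture that needs post-switch work defines the macro, and callers outside the normal task switch guard the call the same way the diff below does for use_mm().

    #ifndef finish_arch_post_lock_switch
    # define finish_arch_post_lock_switch() do { } while (0)
    #endif

    /* pairing with a switch_mm() done outside finish_task_switch(): */
    switch_mm(active_mm, mm, tsk);
    #ifdef finish_arch_post_lock_switch
    finish_arch_post_lock_switch();
    #endif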
Signed-off-by: Martin Schwidefsky --- kernel/sched/core.c | 4 +++- mm/mmu_context.c | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..4b0739c9558e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4692,8 +4692,10 @@ void idle_task_exit(void) BUG_ON(cpu_online(smp_processor_id())); - if (mm != &init_mm) + if (mm != &init_mm) { switch_mm(mm, &init_mm, current); + finish_arch_post_lock_switch(); + } mmdrop(mm); } diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 8a8cd0265e52..f802c2d216a7 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -31,6 +31,9 @@ void use_mm(struct mm_struct *mm) tsk->mm = mm; switch_mm(active_mm, mm, tsk); task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif if (active_mm != mm) mmdrop(active_mm); -- cgit v1.3-14-g43fede From 3d5f35bdfdef5fd627afe9b4bf9c4f32d17f4593 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Thu, 20 Feb 2014 09:19:39 +0100 Subject: sched/deadline: Fix bad accounting of nr_running Rostedt writes: My test suite was locking up hard when enabling mmiotracer. This was due to the mmiotracer placing all but one CPU offline. I found this out when I was able to reproduce the bug with just my stress-cpu-hotplug test. This bug baffled me because it would not always trigger, and would only trigger on the first run after boot up. The stress-cpu-hotplug test would crash hard the first run, or never crash at all. But a new reboot may cause it to crash on the first run again. I spent all week bisecting this, as I couldn't find a consistent reproducer. I finally narrowed it down to the sched deadline patches, and even more peculiar, to the commit that added the sched deadline boot up self test to the latency tracer. Then it dawned on me to what the bug was. All it took was to run a task under sched deadline to screw up the CPU hot plugging. This explained why it would lock up only on the first run of the stress-cpu-hotplug test. The bug happened when the boot up self test of the schedule latency tracer would test a deadline task. The deadline task would corrupt something that would cause CPU hotplug to fail. If it didn't corrupt it, the stress test would always work (there's no other sched deadline tasks that would run to cause problems). If it did corrupt on boot up, the first test would lockup hard. I proved this theory by running my deadline test program on another box, and then run the stress-cpu-hotplug test, and it would now consistently lock up. I could run stress-cpu-hotplug over and over with no problem, but once I ran the deadline test, the next run of the stress-cpu-hotplug would lock hard. After adding lots of tracing to the code, I found the cause. The function tracer showed that migrate_tasks() was stuck in an infinite loop, where rq->nr_running never equaled 1 to break out of it. When I added a trace_printk() to see what that number was, it was 335 and never decrementing! 
Looking at the deadline code I found: static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { dequeue_dl_entity(&p->dl); dequeue_pushable_dl_task(rq, p); } static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); dec_nr_running(rq); } And this: if (dl_runtime_exceeded(rq, dl_se)) { __dequeue_task_dl(rq, curr, 0); if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted))) dl_se->dl_throttled = 1; else enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) resched_task(curr); } Notice how we call __dequeue_task_dl() and in the else case we call enqueue_task_dl()? Also notice that dequeue_task_dl() has underscores where enqueue_task_dl() does not. The enqueue_task_dl() calls inc_nr_running(rq), but __dequeue_task_dl() does not. This is where we get nr_running out of sync. [snip] Another point where nr_running can get out of sync is when the dl_timer fires: dl_se->dl_throttled = 0; if (p->on_rq) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_task(rq->curr); This patch does two things: - correctly accounts for throttled tasks (that are now considered !running); - fixes the bug, updating nr_running from {inc,dec}_dl_tasks(), since we risk to update it twice in some situations (e.g., a task is dequeued while it has exceeded its budget). Cc: mingo@redhat.com Cc: torvalds@linux-foundation.org Cc: akpm@linux-foundation.org Reported-by: Steven Rostedt Reviewed-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392884379-13744-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0dd5e0971a07..b819577c21de 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -717,6 +717,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; + inc_nr_running(rq_of_dl_rq(dl_rq)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -730,6 +731,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; + dec_nr_running(rq_of_dl_rq(dl_rq)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -836,8 +838,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); - - inc_nr_running(rq); } static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -850,8 +850,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) { update_curr_dl(rq); __dequeue_task_dl(rq, p, flags); - - dec_nr_running(rq); } /* -- cgit v1.3-14-g43fede From 4df1638cfaf9b2b7ad993979a41965acab9cd156 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 19 Feb 2014 13:53:35 -0500 Subject: sched/deadline: Fix overflow to handle period==0 and deadline!=0 While debugging the crash with the bad nr_running accounting, I hit another bug where, after running my sched deadline test, I was getting failures to take a CPU offline. It was giving me a -EBUSY error. 
Adding a bunch of trace_printk()s around, I found that the cpu notifier that called sched_cpu_inactive() was returning a failure. The overflow value was coming up negative? Talking this over with Juri, the problem is that the total_bw update was supposed to be made by dl_overflow() which, during my tests, seemed to not be called. Adding more trace_printk()s, it wasn't that it wasn't called, but it exited out right away with the check of new_bw being equal to p->dl.dl_bw. The new_bw calculates the ratio between period and runtime. The bug is that if you set a deadline, you do not need to set a period if you plan on the period being equal to the deadline. That is, if period is zero and deadline is not, then the system call should set the period to be equal to the deadline. This is done elsewhere in the code. The fix is easy: check if period is set, and if it is not, then use the deadline. Cc: Juri Lelli Cc: Ingo Molnar Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140219135335.7e74abd4@gandalf.local.home Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..24914488da41 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1952,7 +1952,7 @@ static int dl_overflow(struct task_struct *p, int policy, { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); - u64 period = attr->sched_period; + u64 period = attr->sched_period ?: attr->sched_deadline; u64 runtime = attr->sched_runtime; u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; -- cgit v1.3-14-g43fede From e9e7cb38c21c80c82af4b16608bb4c8c5ec6a28e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 11 Feb 2014 09:24:26 +0100 Subject: sched/core: Fix sched_rt_global_validate Don't compare sysctl_sched_rt_runtime against sysctl_sched_rt_period if the former is equal to RUNTIME_INF, otherwise disabling -rt bandwidth management (with CONFIG_RT_GROUP_SCHED=n) fails.
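Why the sentinel test matters, sketched (RUNTIME_INF is the scheduler's "unlimited runtime" value, an all-ones u64 in the headers, so it compares larger than any period; treat the exact definition as an assumption):

    /* RUNTIME_INF means "no throttling"; it must not be range-checked */
    if (sysctl_sched_rt_runtime != RUNTIME_INF &&
        sysctl_sched_rt_runtime > sysctl_sched_rt_period)
            return -EINVAL;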
Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392107067-19907-2-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 24914488da41..98d33c105252 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7475,7 +7475,8 @@ static int sched_rt_global_validate(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; - if (sysctl_sched_rt_runtime > sysctl_sched_rt_period) + if ((sysctl_sched_rt_runtime != RUNTIME_INF) && + (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) return -EINVAL; return 0; -- cgit v1.3-14-g43fede From 495163420ab5398c84af96ca3eae2c6aa4a140da Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 11 Feb 2014 09:24:27 +0100 Subject: sched/core: Make dl_b->lock IRQ safe Fix this lockdep warning: [ 44.804600] ========================================================= [ 44.805746] [ INFO: possible irq lock inversion dependency detected ] [ 44.805746] 3.14.0-rc2-test+ #14 Not tainted [ 44.805746] --------------------------------------------------------- [ 44.805746] bash/3674 just changed the state of lock: [ 44.805746] (&dl_b->lock){+.....}, at: [] sched_rt_handler+0x132/0x248 [ 44.805746] but this lock was taken by another, HARDIRQ-safe lock in the past: [ 44.805746] (&rq->lock){-.-.-.} and interrupts could create inverse lock ordering between them. [ 44.805746] [ 44.805746] other info that might help us debug this: [ 44.805746] Possible interrupt unsafe locking scenario: [ 44.805746] [ 44.805746] CPU0 CPU1 [ 44.805746] ---- ---- [ 44.805746] lock(&dl_b->lock); [ 44.805746] local_irq_disable(); [ 44.805746] lock(&rq->lock); [ 44.805746] lock(&dl_b->lock); [ 44.805746] [ 44.805746] lock(&rq->lock); by making dl_b->lock acquiring always IRQ safe. 
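The general rule the fix applies, sketched: any lock that can be taken while holding the IRQ-disabled rq->lock must itself always be taken with IRQs off, otherwise lockdep reports the inversion shown above.

    unsigned long flags;

    raw_spin_lock_irqsave(&dl_b->lock, flags);
    /* ... read or update dl_b->bw / dl_b->total_bw ... */
    raw_spin_unlock_irqrestore(&dl_b->lock, flags);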
Cc: Ingo Molnar Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392107067-19907-3-git-send-email-juri.lelli@gmail.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 98d33c105252..33d030a133d2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7422,6 +7422,7 @@ static int sched_dl_global_constraints(void) u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); int cpu, ret = 0; + unsigned long flags; /* * Here we want to check the bandwidth not being set to some @@ -7435,10 +7436,10 @@ static int sched_dl_global_constraints(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); if (new_bw < dl_b->total_bw) ret = -EBUSY; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); if (ret) break; @@ -7451,6 +7452,7 @@ static void sched_dl_do_global(void) { u64 new_bw = -1; int cpu; + unsigned long flags; def_dl_bandwidth.dl_period = global_rt_period(); def_dl_bandwidth.dl_runtime = global_rt_runtime(); @@ -7464,9 +7466,9 @@ static void sched_dl_do_global(void) for_each_possible_cpu(cpu) { struct dl_bw *dl_b = dl_bw_of(cpu); - raw_spin_lock(&dl_b->lock); + raw_spin_lock_irqsave(&dl_b->lock, flags); dl_b->bw = new_bw; - raw_spin_unlock(&dl_b->lock); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); } } -- cgit v1.3-14-g43fede From 3cf1962cdbf6b3a9e3ef21116d215bbab350ea37 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 18 Feb 2014 17:12:44 -0500 Subject: sched,numa: add cond_resched to task_numa_work Normally task_numa_work scans over a fairly small amount of memory, but it is possible to run into a large unpopulated part of virtual memory, with no pages mapped. In that case, task_numa_work can run for a while, and it may make sense to reschedule as required. Cc: akpm@linux-foundation.org Cc: Andrea Arcangeli Signed-off-by: Rik van Riel Reported-by: Xing Gang Tested-by: Chegu Vinod Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392761566-24834-2-git-send-email-riel@redhat.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 966cc2bfcb77..78157099b167 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1757,6 +1757,8 @@ void task_numa_work(struct callback_head *work) start = end; if (pages <= 0) goto out; + + cond_resched(); } while (end != vma->vm_end); } -- cgit v1.3-14-g43fede From 4efbc454ba68def5ef285b26ebfcfdb605b52755 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 16 Feb 2014 22:24:17 +0100 Subject: sched: Fix information leak in sys_sched_getattr() We're copying the on-stack structure to userspace, but forgot to give the right number of bytes to copy. This allows the calling process to obtain up to PAGE_SIZE bytes from the stack (and possibly adjacent kernel memory). This fix copies only as much as we actually have on the stack (attr->size defaults to the size of the struct) and leaves the rest of the userspace-provided buffer untouched. Found using kmemcheck + trinity. 
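The corrected copy-out pattern, condensed into a simplified sketch (the helper name is hypothetical; the real function also validates the caller-supplied size first): the byte count handed to copy_to_user() must come from what the kernel actually initialized (attr->size), never from the caller-supplied buffer length alone:

    /* Simplified sketch of the bounded copy-out; illustrative only. */
    static int sched_read_attr_sketch(struct sched_attr __user *uattr,
                                      struct sched_attr *attr,
                                      unsigned int usize)
    {
            if (usize < attr->size)         /* caller buffer is smaller */
                    attr->size = usize;     /* report the truncated size */

            /* Copy attr->size bytes, i.e. what we filled in, not usize. */
            if (copy_to_user(uattr, attr, attr->size))
                    return -EFAULT;
            return 0;
    }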
Fixes: d50dde5a10f30 ("sched: Add new scheduler syscalls to support an extended scheduling parameters ABI") Cc: Dario Faggioli Cc: Juri Lelli Cc: Ingo Molnar Signed-off-by: Vegard Nossum Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392585857-10725-1-git-send-email-vegard.nossum@oracle.com Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 33d030a133d2..a6e7470166c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3786,7 +3786,7 @@ static int sched_read_attr(struct sched_attr __user *uattr, attr->size = usize; } - ret = copy_to_user(uattr, attr, usize); + ret = copy_to_user(uattr, attr, attr->size); if (ret) return -EFAULT; -- cgit v1.3-14-g43fede From 6d35ab48090b10c5ea5604ed5d6e91f302dc6060 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Feb 2014 17:19:29 +0100 Subject: sched: Add 'flags' argument to sched_{set,get}attr() syscalls Because of a recent syscall design debate, it is deemed appropriate for each syscall to have a flags argument for future extension, without immediately requiring new syscalls. Cc: juri.lelli@gmail.com Cc: Ingo Molnar Suggested-by: Michael Kerrisk Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140214161929.GL27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/syscalls.h | 6 ++++-- kernel/sched/core.c | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 40ed9e9a77e5..a747a77ea584 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -281,13 +281,15 @@ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param); asmlinkage long sys_sched_setattr(pid_t pid, - struct sched_attr __user *attr); + struct sched_attr __user *attr, + unsigned int flags); asmlinkage long sys_sched_getscheduler(pid_t pid); asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param); asmlinkage long sys_sched_getattr(pid_t pid, struct sched_attr __user *attr, - unsigned int size); + unsigned int size, + unsigned int flags); asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr); asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6e7470166c7..6edbef296ece 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3661,13 +3661,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * @pid: the pid in question. * @uattr: structure containing the extended parameters. */ -SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr) +SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, flags) { struct sched_attr attr; struct task_struct *p; int retval; - if (!uattr || pid < 0) + if (!uattr || pid < 0 || flags) return -EINVAL; if (sched_copy_attr(uattr, &attr)) @@ -3804,8 +3805,8 @@ err_size: * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp.
*/ -SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, - unsigned int, size) +SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + unsigned int, size, unsigned int, flags) { struct sched_attr attr = { .size = sizeof(struct sched_attr), @@ -3814,7 +3815,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, int retval; if (!uattr || pid < 0 || size > PAGE_SIZE || - size < SCHED_ATTR_SIZE_VER0) + size < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; rcu_read_lock(); -- cgit v1.3-14-g43fede From 82b95800b256205cff2eeab5bbd03430d2d0f20d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 17 Feb 2014 09:12:33 -0500 Subject: sched/deadline: Test for CPU's presence explicitly A hot-removed CPU may have an ID that is numerically larger than the number of existing CPUs in the system (e.g. we can unplug CPU 4 from a system that has CPUs 0, 1 and 4). Thus the WARN_ONs should check whether the CPU in question is currently present, not whether its ID value is less than num_present_cpus(). Cc: Ingo Molnar Cc: Juri Lelli Cc: Steven Rostedt Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Boris Ostrovsky Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392646353-1874-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Thomas Gleixner --- kernel/sched/cpudeadline.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 045fc74e3f09..5b8838b56d1c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID); + WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1); + WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); return best_cpu; } @@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) int old_idx, new_cpu; unsigned long flags; - WARN_ON(cpu > num_present_cpus()); + WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); old_idx = cp->cpu_to_idx[cpu]; -- cgit v1.3-14-g43fede From 995b9ea440862def83e8fcb1b498e68f93d4af59 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 18 Feb 2014 02:24:13 +0400 Subject: sched/deadline: Remove useless dl_nr_total In the deadline class we do not have group scheduling like in RT. dl_nr_total is the same as dl_nr_running, so one of them should be removed.
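A condensed sketch of why the two counters could never diverge (the function below abbreviates the real enqueue path; the names follow the kernel source, the body is trimmed):

    /* Illustrative: both counters were bumped on the same enqueue path. */
    static void inc_dl_tasks_sketch(struct sched_dl_entity *dl_se,
                                    struct dl_rq *dl_rq)
    {
            dl_rq->dl_nr_running++;         /* the counter that stays */
            inc_dl_migration(dl_se, dl_rq); /* dl_nr_total++ used to live here */
    }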
Cc: Ingo Molnar Cc: Juri Lelli Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/368631392675853@web20h.yandex.ru Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 4 +--- kernel/sched/sched.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b819577c21de..15cbc17fbf84 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq) static void update_dl_migration(struct dl_rq *dl_rq) { - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) { + if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { if (!dl_rq->overloaded) { dl_set_overload(rq_of_dl_rq(dl_rq)); dl_rq->overloaded = 1; @@ -137,7 +137,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) struct task_struct *p = dl_task_of(dl_se); dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total++; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -149,7 +148,6 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) struct task_struct *p = dl_task_of(dl_se); dl_rq = &rq_of_dl_rq(dl_rq)->dl; - dl_rq->dl_nr_total--; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8b..f964add50f38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -462,7 +462,6 @@ struct dl_rq { } earliest_dl; unsigned long dl_nr_migratory; - unsigned long dl_nr_total; int overloaded; /* -- cgit v1.3-14-g43fede From eb7a59b2c888c2518ba2c9d0020343ca71aa9dee Mon Sep 17 00:00:00 2001 From: Michael wang Date: Thu, 20 Feb 2014 11:14:53 +0800 Subject: sched/fair: Reset se-depth when task switched to FAIR Sasha reported: [ 522.645288] BUG: unable to handle kernel NULL pointer dereference at ... [ 522.646271] IP: [] check_preempt_wakeup+0x11f/0x210 ... [ 522.650021] Call Trace: [ 522.650021] [ 522.650021] [] check_preempt_curr+0x3d/0xb0 [ 522.650021] [] ttwu_do_wakeup+0x18/0x130 ... which was caused by se->depth being changed while the task was not in the FAIR class; the stale depth value was then used after the task switched back to FAIR. This patch resets the depth when a task is switched to FAIR, making sure that we always have the correct value when the task is FAIR. Cc: Ingo Molnar Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: Michael Wang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5305732D.70001@linux.vnet.ibm.com Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 235cfa7ad8fc..280da893cd0f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7317,7 +7317,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) */ static void switched_to_fair(struct rq *rq, struct task_struct *p) { - if (!p->se.on_rq) + struct sched_entity *se = &p->se; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* + * Since the real-depth could have been changed (only FAIR + * class maintain depth value), reset depth properly. + */ + se->depth = se->parent ?
se->parent->depth + 1 : 0; +#endif + if (!se->on_rq) return; /* -- cgit v1.3-14-g43fede From 6e83125c6b151afa139c8852c099d6d92954fe3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Feb 2014 16:11:48 +0100 Subject: sched/fair: Remove idle_balance() declaration in sched.h Remove idle_balance() from the public life; also reduce some #ifdef clutter by folding the pick_next_task_fair() idle path into idle_balance(). Cc: mingo@kernel.org Reported-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140211151148.GP27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++------------------ kernel/sched/sched.h | 7 ------- 2 files changed, 29 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 280da893cd0f..40c758bbdd57 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2374,13 +2374,13 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) se->avg.load_avg_contrib >>= NICE_0_SHIFT; } } -#else +#else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) { @@ -2571,6 +2571,8 @@ void idle_exit_fair(struct rq *this_rq) update_rq_runnable_avg(this_rq, 0); } +static int idle_balance(struct rq *this_rq); + #else /* CONFIG_SMP */ static inline void update_entity_load_avg(struct sched_entity *se, @@ -2584,6 +2586,12 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, int sleep) {} static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) {} + +static inline int idle_balance(struct rq *rq) +{ + return 0; +} + #endif /* CONFIG_SMP */ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -4677,7 +4685,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) struct sched_entity *se; struct task_struct *p; -again: __maybe_unused +again: #ifdef CONFIG_FAIR_GROUP_SCHED if (!cfs_rq->nr_running) goto idle; @@ -4775,18 +4783,8 @@ simple: return p; idle: -#ifdef CONFIG_SMP - idle_enter_fair(rq); - /* - * We must set idle_stamp _before_ calling idle_balance(), such that we - * measure the duration of idle_balance() as idle time. - */ - rq->idle_stamp = rq_clock(rq); - if (idle_balance(rq)) { /* drops rq->lock */ - rq->idle_stamp = 0; + if (idle_balance(rq)) /* drops rq->lock */ goto again; - } -#endif return NULL; } @@ -6634,7 +6632,7 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -int idle_balance(struct rq *this_rq) +static int idle_balance(struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; @@ -6642,8 +6640,15 @@ int idle_balance(struct rq *this_rq) u64 curr_cost = 0; int this_cpu = this_rq->cpu; + idle_enter_fair(this_rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + this_rq->idle_stamp = rq_clock(this_rq); + if (this_rq->avg_idle < sysctl_sched_migration_cost) - return 0; + goto out; /* * Drop the rq->lock, but keep IRQ/preempt disabled. 
@@ -6692,8 +6697,10 @@ int idle_balance(struct rq *this_rq) * While browsing the domains, we released the rq lock. * A task could have be enqueued in the meantime */ - if (this_rq->nr_running && !pulled_task) - return 1; + if (this_rq->nr_running && !pulled_task) { + pulled_task = 1; + goto out; + } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -6706,6 +6713,10 @@ int idle_balance(struct rq *this_rq) if (curr_cost > this_rq->max_idle_balance_cost) this_rq->max_idle_balance_cost = curr_cost; +out: + if (pulled_task) + this_rq->idle_stamp = 0; + return pulled_task; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1bf34c257d3b..92018f9821e8 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1163,17 +1163,10 @@ extern const struct sched_class idle_sched_class; extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern int idle_balance(struct rq *this_rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); -#else /* CONFIG_SMP */ - -static inline void idle_balance(int cpu, struct rq *rq) -{ -} - #endif extern void sysrq_sched_debug_show(void); -- cgit v1.3-14-g43fede From 3f1d2a318171bf61850d4e5a72031271e5aada76 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Feb 2014 10:49:30 +0100 Subject: sched: Fix hotplug task migration Dan Carpenter reported: > kernel/sched/rt.c:1347 pick_next_task_rt() warn: variable dereferenced before check 'prev' (see line 1338) > kernel/sched/deadline.c:1011 pick_next_task_dl() warn: variable dereferenced before check 'prev' (see line 1005) Kirill also spotted that migrate_tasks() will have an instant NULL deref because pick_next_task() will immediately deref prev. Instead of fixing all the corner cases because migrate_tasks() can pass in a NULL prev task in the unlikely case of hot-un-plug, provide a fake task such that we can remove all the NULL checks from the far more common paths. A further problem, not previously spotted, is that because we pushed pre_schedule() and idle_balance() into pick_next_task() we now need to avoid those getting called and pulling more tasks on our dying CPU. We avoid pull_{dl,rt}_task() by setting fake_task.prio to MAX_PRIO+1. We also note that since we call pick_next_task() exactly as many times as we have runnable tasks present, we should never land in idle_balance().
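The fake task is an instance of the sentinel (null object) pattern: rather than sprinkling NULL checks over every pick_next_task() implementation, callers are handed a valid object whose operations are harmless no-ops. A minimal generic illustration, independent of the scheduler code:

    #include <stdio.h>

    struct ops {
            void (*op)(void);
    };

    static void nop(void) { }                       /* harmless stand-in */
    static const struct ops sentinel = { .op = nop }; /* the "fake" object */

    static void run(const struct ops *o)
    {
            o->op();        /* no NULL check needed on any caller path */
    }

    int main(void)
    {
            run(&sentinel);
            return 0;
    }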
Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()") Cc: Juri Lelli Cc: Ingo Molnar Cc: Steven Rostedt Reported-by: Kirill Tkhai Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140212094930.GB3545@laptop.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- kernel/sched/core.c | 18 +++++++++++++++++- kernel/sched/deadline.c | 3 +-- kernel/sched/fair.c | 5 ++--- kernel/sched/idle_task.c | 3 +-- kernel/sched/rt.c | 3 +-- kernel/sched/sched.h | 5 +++++ kernel/sched/stop_task.c | 3 +-- 7 files changed, 28 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fb9764fbc537..49db434a35d0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4681,6 +4681,22 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) +{ +} + +static const struct sched_class fake_sched_class = { + .put_prev_task = put_prev_task_fake, +}; + +static struct task_struct fake_task = { + /* + * Avoid pull_{rt,dl}_task() + */ + .prio = MAX_PRIO + 1, + .sched_class = &fake_sched_class, +}; + /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -4721,7 +4737,7 @@ static void migrate_tasks(unsigned int dead_cpu) if (rq->nr_running == 1) break; - next = pick_next_task(rq, NULL); + next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ed31ef66ab9d..bfeb84ecc32b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1008,8 +1008,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (unlikely(!dl_rq->dl_nr_running)) return NULL; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40c758bbdd57..e884e45982af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4690,7 +4690,7 @@ again: if (!cfs_rq->nr_running) goto idle; - if (!prev || prev->sched_class != &fair_sched_class) + if (prev->sched_class != &fair_sched_class) goto simple; /* @@ -4766,8 +4766,7 @@ simple: if (!cfs_rq->nr_running) goto idle; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); do { se = pick_next_entity(cfs_rq, NULL); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index f7d03af79a5b..53ff9e7c76d2 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -26,8 +26,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct * pick_next_task_idle(struct rq *rq, struct task_struct *prev) { - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 72f9ec759972..65c2d6881ac3 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1344,8 +1344,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (rt_rq_throttled(rt_rq)) return NULL; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); p = _pick_next_task_rt(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 92018f9821e8..d276147ba5e4 100644 --- a/kernel/sched/sched.h +++ 
b/kernel/sched/sched.h @@ -1147,6 +1147,11 @@ struct sched_class { #endif }; +static inline void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + prev->sched_class->put_prev_task(rq, prev); +} + #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index a4147c9d2017..d6ce65dde541 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -31,8 +31,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev) if (!stop || !stop->on_rq) return NULL; - if (prev) - prev->sched_class->put_prev_task(rq, prev); + put_prev_task(rq, prev); stop->se.exec_start = rq_clock_task(rq); -- cgit v1.3-14-g43fede From dc87734106bb6e97c92d8bd81f261fb71976ec2c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Feb 2014 15:47:29 +0100 Subject: sched: Remove some #ifdeffery Remove a few gratuitous #ifdefs in pick_next_task*(). Cc: Ingo Molnar Cc: Steven Rostedt Cc: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-nnzddp5c4fijyzzxxrwlxghf@git.kernel.org Signed-off-by: Thomas Gleixner --- kernel/sched/deadline.c | 31 +++++++++++++++++++++++++------ kernel/sched/idle_task.c | 4 ---- kernel/sched/rt.c | 41 ++++++++++++++++++++++++++++++----------- kernel/sched/sched.h | 5 +++++ 4 files changed, 60 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index bfeb84ecc32b..3185b775dbf7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -214,6 +214,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq) static int push_dl_task(struct rq *rq); +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return dl_task(prev); +} + +static inline void set_post_schedule(struct rq *rq) +{ + rq->post_schedule = has_pushable_dl_tasks(rq); +} + #else static inline @@ -236,6 +246,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { } +static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_dl_task(struct rq *rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -1000,10 +1023,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) dl_rq = &rq->dl; -#ifdef CONFIG_SMP - if (dl_task(prev)) + if (need_pull_dl_task(rq, prev)) pull_dl_task(rq); -#endif if (unlikely(!dl_rq->dl_nr_running)) return NULL; @@ -1024,9 +1045,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) start_hrtick_dl(rq, p); #endif -#ifdef CONFIG_SMP - rq->post_schedule = has_pushable_dl_tasks(rq); -#endif /* CONFIG_SMP */ + set_post_schedule(rq); return p; } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 53ff9e7c76d2..1f3725882838 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -29,9 +29,7 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); -#ifdef CONFIG_SMP idle_enter_fair(rq); -#endif return rq->idle; } @@ -50,10 +48,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { -#ifdef CONFIG_SMP idle_exit_fair(rq); rq_last_tick_reset(rq); -#endif } static 
void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 65c2d6881ac3..3e488ca6050d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -231,6 +231,12 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) static int pull_rt_task(struct rq *this_rq); +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + return rq->rt.highest_prio.curr > prev->prio; +} + static inline int rt_overloaded(struct rq *rq) { return atomic_read(&rq->rd->rto_count); @@ -317,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq) return !plist_head_empty(&rq->rt.pushable_tasks); } +static inline void set_post_schedule(struct rq *rq) +{ + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +} + static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); @@ -361,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { } +static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) +{ + return false; +} + +static inline int pull_rt_task(struct rq *this_rq) +{ + return 0; +} + +static inline void set_post_schedule(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ static inline int on_rt_rq(struct sched_rt_entity *rt_se) @@ -1332,11 +1360,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; -#ifdef CONFIG_SMP - /* Try to pull RT tasks here if we lower this rq's prio */ - if (rq->rt.highest_prio.curr > prev->prio) + if (need_pull_rt_task(rq, prev)) pull_rt_task(rq); -#endif if (!rt_rq->rt_nr_running) return NULL; @@ -1352,13 +1377,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) if (p) dequeue_pushable_task(rq, p); -#ifdef CONFIG_SMP - /* - * We detect this state here so that we can avoid taking the RQ - * lock again later if there is no need to push - */ - rq->post_schedule = has_pushable_tasks(rq); -#endif + set_post_schedule(rq); return p; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d276147ba5e4..caf4abda45e3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1172,6 +1172,11 @@ extern void trigger_load_balance(struct rq *rq); extern void idle_enter_fair(struct rq *this_rq); extern void idle_exit_fair(struct rq *this_rq); +#else + +static inline void idle_enter_fair(struct rq *rq) { } +static inline void idle_exit_fair(struct rq *rq) { } + #endif extern void sysrq_sched_debug_show(void); -- cgit v1.3-14-g43fede From cd578abb24aa67ce468c427d3356c08ea32cf768 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 11 Feb 2014 16:01:16 +0100 Subject: perf/x86: Warn to early_printk() in case irq_work is too slow On Mon, Feb 10, 2014 at 08:45:16AM -0800, Dave Hansen wrote: > The reason I coded this up was that NMIs were firing off so fast that > nothing else was getting a chance to run. With this patch, at least the > printk() would come out and I'd have some idea what was going on. It will start spewing to early_printk() (which is a lot nicer to use from NMI context too) when it fails to queue the IRQ-work because it's already enqueued.
It does have a false positive when two CPUs trigger the warning concurrently, but that should be rare and some extra clutter on the early printk shouldn't be a problem. Cc: hpa@zytor.com Cc: tglx@linutronix.de Cc: dzickus@redhat.com Cc: Dave Hansen Cc: mingo@kernel.org Fixes: 6a02ad66b2c4 ("perf/x86: Push the duration-logging printk() to IRQ context") Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140211150116.GO27965@twins.programming.kicks-ass.net Signed-off-by: Thomas Gleixner --- include/linux/irq_work.h | 2 +- kernel/events/core.c | 9 +++++++-- kernel/irq_work.c | 6 ++++-- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index add13c8624b7..19ae05d4b8ec 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -32,7 +32,7 @@ void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) #define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { .func = (_f), } -void irq_work_queue(struct irq_work *work); +bool irq_work_queue(struct irq_work *work); void irq_work_run(void); void irq_work_sync(struct irq_work *work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2067cbb378eb..45e5543e2a1e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -243,7 +243,7 @@ static void perf_duration_warn(struct irq_work *w) printk_ratelimited(KERN_WARNING "perf interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns, + avg_local_sample_len, allowed_ns >> 1, sysctl_perf_event_sample_rate); } @@ -283,7 +283,12 @@ void perf_sample_event_took(u64 sample_len_ns) update_perf_cpu_limits(); - irq_work_queue(&perf_duration_work); + if (!irq_work_queue(&perf_duration_work)) { + early_printk("perf interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + avg_local_sample_len, allowed_ns >> 1, + sysctl_perf_event_sample_rate); + } } static atomic64_t perf_event_id; diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 55fcce6065cf..a82170e2fa78 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void) * * Can be re-enqueued while the callback is still in progress. */ -void irq_work_queue(struct irq_work *work) +bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) - return; + return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); @@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work) } preempt_enable(); + + return true; } EXPORT_SYMBOL_GPL(irq_work_queue); -- cgit v1.3-14-g43fede From 77177856e3bf39d435b3ae4bfd164ca3c8cd4577 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:37 +0100 Subject: sched: Init idle->on_rq in init_idle() We stumbled in RT over an SMP bringup issue on ARM where the idle->on_rq == 0 was causing try_to_wake_up() on the other cpu to run into nada land. After adding that idle->on_rq = 1; I was able to find the root cause of the lockup: the idle task on the newly woken up cpu was fiddling with a sleeping spinlock, which is a no-no. I kept the init of idle->on_rq to keep the state consistent and to avoid another long lasting debug session. As a side note, the whole debug mess could have been avoided if might_sleep() would have yelled when called from the idle task.
That's fixed with patch 2/6 - and that one actually has a changelog :) Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-2-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 49db434a35d0..06da865043ec 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4443,6 +4443,7 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; + idle->on_rq = 1; #if defined(CONFIG_SMP) idle->on_cpu = 1; #endif -- cgit v1.3-14-g43fede From db273be2a7d42f92b3471e0f717982928214a650 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:38 +0100 Subject: sched: Check for idle task in might_sleep() Idle is not allowed to call sleeping functions ever! Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-3-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 06da865043ec..a01fe6cfdb9b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6927,7 +6927,8 @@ void __might_sleep(const char *file, int line, int preempt_offset) static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ - if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && + !is_idle_task(current)) || system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -- cgit v1.3-14-g43fede From 8f47b1871b8aac98f1a9d93bc3467fb97b65199a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:39 +0100 Subject: sched: Add better debug output for might_sleep() might_sleep() can tell us where interrupts have been disabled, but we have no idea what disabled preemption. Add some debug infrastructure. 
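Condensed, the mechanism added by the patch below has two halves: record the caller's instruction pointer on the 0 -> !0 preempt-count transition, then report it from the debug paths. A hedged sketch of both fragments (the full diff follows; this is illustrative, guarded by CONFIG_DEBUG_PREEMPT in the real code):

    /* 1) On the first preempt_count increment, remember the culprit. */
    if (preempt_count() == val)
            current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);

    /* 2) When a sleep in atomic context is caught, point at it. */
    pr_err("Preemption disabled at:");
    print_ip_sym(current->preempt_disable_ip);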
Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-4-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched/core.c | 23 +++++++++++++++++++++-- 2 files changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c49a2585ff7d..825ed838d4b9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1463,6 +1463,9 @@ struct task_struct { struct mutex perf_event_mutex; struct list_head perf_event_list; #endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a01fe6cfdb9b..c94e851dc981 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2501,8 +2501,13 @@ void __kprobes preempt_count_add(int val) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); #endif - if (preempt_count() == val) - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + if (preempt_count() == val) { + unsigned long ip = get_parent_ip(CALLER_ADDR1); +#ifdef CONFIG_DEBUG_PREEMPT + current->preempt_disable_ip = ip; +#endif + trace_preempt_off(CALLER_ADDR0, ip); + } } EXPORT_SYMBOL(preempt_count_add); @@ -2545,6 +2550,13 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); +#ifdef CONFIG_DEBUG_PREEMPT + if (in_atomic_preempt_off()) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -6946,6 +6958,13 @@ void __might_sleep(const char *file, int line, int preempt_offset) debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); +#ifdef CONFIG_DEBUG_PREEMPT + if (!preempt_count_equals(preempt_offset)) { + pr_err("Preemption disabled at:"); + print_ip_sym(current->preempt_disable_ip); + pr_cont("\n"); + } +#endif dump_stack(); } EXPORT_SYMBOL(__might_sleep); -- cgit v1.3-14-g43fede From d6b1e9119787fd2e31dcf0f0ce90b71197604206 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:40 +0100 Subject: sched: Adjust p->sched_reset_on_fork when nothing else changes If the policy and priority remain unchanged, a possible modification of p->sched_reset_on_fork gets lost in the early exit path. Signed-off-by: Thomas Gleixner [ Rebase ontop of v3.14-rc1. ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-5-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c94e851dc981..771eb8762df4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3362,7 +3362,8 @@ recheck: } /* - * If not changing anything there's no need to proceed further: + * If not changing anything there's no need to proceed further, + * but store a possible modification of reset_on_fork.
*/ if (unlikely(policy == p->policy)) { if (fair_policy(policy) && attr->sched_nice != task_nice(p)) goto change; if (rt_policy(policy) && attr->sched_priority != p->rt_priority) goto change; if (dl_policy(policy)) goto change; + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &flags); return 0; } -- cgit v1.3-14-g43fede From 81a44c5441d7f7d2c3dc9105f4d65ad0d5818617 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:41 +0100 Subject: sched: Queue RT tasks to head when prio drops The following scenario does not work correctly: Runqueue of CPUx contains two runnable and pinned tasks: T1: SCHED_FIFO, prio 80 T2: SCHED_FIFO, prio 80 T1 is on the cpu and executes the following syscalls (classic priority ceiling scenario): sys_sched_setscheduler(pid(T1), SCHED_FIFO, .prio = 90); ... sys_sched_setscheduler(pid(T1), SCHED_FIFO, .prio = 80); ... Now T1 gets preempted by T3 (SCHED_FIFO, prio 95). After T3 goes back to sleep the scheduler picks T2. Surprise! The same happens w/o actual preemption when T1 is forced into the scheduler due to a sporadic NEED_RESCHED event. The scheduler invokes pick_next_task() which returns T2. So T1 gets preempted and scheduled out. This happens because sched_setscheduler() dequeues T1 from the prio 90 list and then enqueues it on the tail of the prio 80 list behind T2. This violates the POSIX spec and surprises user space, which relies on the guarantee that SCHED_FIFO tasks are not scheduled out unless they give the CPU up voluntarily or are preempted by a higher priority task. In the latter case the preempted task must get back on the CPU after the preempting task schedules out again. We fixed a similar issue already in commit 60db48c (sched: Queue a deboosted task to the head of the RT prio queue). The same treatment is necessary for sched_setscheduler(). So enqueue to head of the prio bucket list if the priority of the task is lowered. It might be possible that existing user space relies on the current behaviour, but it can be considered highly unlikely due to the corner case nature of the application scenario. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-6-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 771eb8762df4..9c2fcbf9a266 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3442,8 +3442,13 @@ change: if (running) p->sched_class->set_curr_task(rq); - if (on_rq) - enqueue_task(rq, p, 0); + if (on_rq) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); + } check_class_changed(rq, p, prev_class, oldprio); task_rq_unlock(rq, p, &flags); -- cgit v1.3-14-g43fede From c365c292d05908c6ea6f32708f331e21033fe71d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Feb 2014 20:58:42 +0100 Subject: sched: Consider pi boosting in setscheduler() If a PI-boosted task's policy/priority is modified by a setscheduler() call, we unconditionally dequeue and requeue the task if it is on the runqueue, even if the new priority is lower than the current effective boosted priority. This can result in undesired reordering of the priority bucket list.
If the new priority is less than or equal to the current effective (boosted) priority, we just store the new parameters in the task struct and leave the scheduler class and the runqueue untouched. This is handled when the task deboosts itself. Only if the new priority is higher than the effective boosted priority do we apply the change immediately. Signed-off-by: Thomas Gleixner [ Rebase ontop of v3.14-rc1. ] Signed-off-by: Sebastian Andrzej Siewior Cc: Dario Faggioli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1391803122-4425-7-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- include/linux/sched/rt.h | 7 +++++++ kernel/locking/rtmutex.c | 12 ++++++++++++ kernel/sched/core.c | 41 ++++++++++++++++++++++++++++++----------- 3 files changed, 49 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index f7453d4c5613..6341f5be6e24 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -18,6 +18,7 @@ static inline int rt_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); +extern int rt_mutex_check_prio(struct task_struct *task, int newprio); extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) @@ -29,6 +30,12 @@ static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } + +static inline int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + return 0; +} + static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2e960a2bab81..aa4dff04b594 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -212,6 +212,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task) return task_top_pi_waiter(task)->task; } +/* + * Called by sched_setscheduler() to check whether the priority change + * is overruled by a possible priority boosting. + */ +int rt_mutex_check_prio(struct task_struct *task, int newprio) +{ + if (!task_has_pi_waiters(task)) + return 0; + + return task_top_pi_waiter(task)->task->prio <= newprio; +} + /* * Adjust the priority of a task, after its pi_waiters got modified. * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9c2fcbf9a266..003263b3b05c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2902,7 +2902,8 @@ EXPORT_SYMBOL(sleep_on_timeout); * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). * - * Used by the rt_mutex code to implement priority inheritance logic. + * Used by the rt_mutex code to implement priority inheritance + * logic. Call site only calls if the priority of the task changed. */ void rt_mutex_setprio(struct task_struct *p, int prio) { @@ -3171,9 +3172,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_new = 1; } -/* Actually do priority change: must hold pi & rq lock.
*/ -static void __setscheduler(struct rq *rq, struct task_struct *p, - const struct sched_attr *attr) +static void __setscheduler_params(struct task_struct *p, + const struct sched_attr *attr) { int policy = attr->sched_policy; @@ -3193,9 +3193,14 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; + set_load_weight(p); +} - p->normal_prio = normal_prio(p); - p->prio = rt_mutex_getprio(p); +/* Actually do priority change: must hold pi & rq lock. */ +static void __setscheduler(struct rq *rq, struct task_struct *p, + const struct sched_attr *attr) +{ + __setscheduler_params(p, attr); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -3203,8 +3208,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; - - set_load_weight(p); } static void @@ -3257,6 +3260,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { + int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; @@ -3427,6 +3431,24 @@ change: return -EBUSY; } + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; + + /* + * Special case for priority boosted tasks. + * + * If the new priority is lower or equal (user space view) + * than the current (boosted) priority, we just store the new + * normal parameters and do not touch the scheduler class and + * the runqueue. This will be done when the task deboost + * itself. + */ + if (rt_mutex_check_prio(p, newprio)) { + __setscheduler_params(p, attr); + task_rq_unlock(rq, p, &flags); + return 0; + } + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) @@ -3434,9 +3456,6 @@ change: if (running) p->sched_class->put_prev_task(rq, p); - p->sched_reset_on_fork = reset_on_fork; - - oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, attr); -- cgit v1.3-14-g43fede From d82fd25356b902703152c1800845661835541878 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 8 Feb 2014 14:17:26 +0800 Subject: sched/rt: Remove 'leaf_rt_rq_list' from 'struct rq' This is a leftover from commit e23ee74777f389369431d77390c4b09332ce026a ("sched/rt: Simplify pull_rt_task() logic and remove .leaf_rt_rq_list"). Signed-off-by: Li Zefan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/52F5CBF6.4060901@huawei.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/sched.h | 4 ---- 2 files changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 003263b3b05c..cc4965e969b1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6876,7 +6876,6 @@ void __init sched_init(void) rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_rt_rq_list); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index caf4abda45e3..d608125b36ef 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -543,10 +543,6 @@ struct rq { struct list_head leaf_cfs_rq_list; #endif /* CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif - /* * This is part of a global counter where only the total sum * over all CPUs matters. 
A task can increase this counter on -- cgit v1.3-14-g43fede From 11c785b79ef2a669e4bf7be5cf2c3904b8fed015 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 8 Feb 2014 14:17:45 +0800 Subject: sched/rt: Make init_sched_rt_class() __init It's a bootstrap function. Signed-off-by: Li Zefan Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/52F5CC09.1080502@huawei.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 3e488ca6050d..4d4b386598aa 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1849,7 +1849,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } -void init_sched_rt_class(void) +void __init init_sched_rt_class(void) { unsigned int i; -- cgit v1.3-14-g43fede From d277d868dab6537a85f4757e39648b1d6afc60d5 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:49 +0800 Subject: rcu: Use MAX_NICE to replace hardcoding of 19 Reviewed-by: Josh Triplett Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Cc: "Paul E. McKenney" Link: http://lkml.kernel.org/r/5b3bf232f41b33ab703a1595e94671b303e2d1fc.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/rcu/torture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 732f8ae3086a..219761db1a46 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -805,7 +805,7 @@ rcu_torture_writer(void *arg) static DEFINE_RCU_RANDOM(rand); VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1); @@ -871,7 +871,7 @@ rcu_torture_fakewriter(void *arg) DEFINE_RCU_RANDOM(rand); VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); @@ -987,7 +987,7 @@ rcu_torture_reader(void *arg) unsigned long long ts; VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); @@ -1584,7 +1584,7 @@ static int rcu_torture_barrier_cbs(void *arg) init_rcu_head_on_stack(&rcu); VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); - set_user_nice(current, 19); + set_user_nice(current, MAX_NICE); do { wait_event(barrier_cbs_wq[myid], (newphase = -- cgit v1.3-14-g43fede From 75e45d512f257beedae0d8a67d053cde5537bd4c Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:50 +0800 Subject: sched: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/bd80780f19b4f9b4a765acc353c8dbc130274dd6.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/auto_group.c | 2 +- kernel/sched/core.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 4a073539c58e..e73efba98301 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct
task_struct *p, int nice) struct autogroup *ag; int err; - if (nice < -20 || nice > 19) + if (nice < MIN_NICE || nice > MAX_NICE) return -EINVAL; err = security_task_setnice(current, nice); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc4965e969b1..a8a73b8897bf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2993,7 +2993,7 @@ void set_user_nice(struct task_struct *p, long nice) unsigned long flags; struct rq *rq; - if (task_nice(p) == nice || nice < -20 || nice > 19) + if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; /* * We have to be careful, if called from sys_setpriority(), @@ -3072,10 +3072,10 @@ SYSCALL_DEFINE1(nice, int, increment) increment = 40; nice = task_nice(current) + increment; - if (nice < -20) - nice = -20; - if (nice > 19) - nice = 19; + if (nice < MIN_NICE) + nice = MIN_NICE; + if (nice > MAX_NICE) + nice = MAX_NICE; if (increment < 0 && !can_nice(current, nice)) return -EPERM; @@ -3623,7 +3623,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr, * XXX: do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? */ - attr->sched_nice = clamp(attr->sched_nice, -20, 19); + attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); out: return ret; -- cgit v1.3-14-g43fede From c4a4d2f43177f6165132c6d36a4d46963018f726 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:51 +0800 Subject: sys: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Cc: Andrew Morton Cc: Oleg Nesterov Cc: Robin Holt Cc: Al Viro Cc: Kees Cook Cc: "Eric W. Biederman" Cc: Stephen Rothwell Link: http://lkml.kernel.org/r/0261f094b836f1acbcdf52e7166487c0c77323c8.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index c0a58be780a4..adaeab6f7a87 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) /* normalize: avoid signed division (rounding problems) */ error = -ESRCH; - if (niceval < -20) - niceval = -20; - if (niceval > 19) - niceval = 19; + if (niceval < MIN_NICE) + niceval = MIN_NICE; + if (niceval > MAX_NICE) + niceval = MAX_NICE; rcu_read_lock(); read_lock(&tasklist_lock); -- cgit v1.3-14-g43fede From 144818422bfa272531250473b888394c21e6fe19 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Tue, 11 Feb 2014 15:34:52 +0800 Subject: workqueue: Replace hardcoding of -20 and 19 with MIN_NICE and MAX_NICE Signed-off-by: Dongsheng Yang Acked-by: Tejun Heo Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/6d85138180c00ce86975addab6e34b24b84f00a5.1392103744.git.yangds.fnst@cn.fujitsu.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 193e977a10ea..3fa5b8f3aae3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3225,7 +3225,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, return -ENOMEM; if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= -20 && attrs->nice <= 19) + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) ret = apply_workqueue_attrs(wq, attrs); else ret = -EINVAL; -- cgit 
v1.3-14-g43fede From de91b9cb97fe68cb6ef0cfe9bee09d015c152af8 Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Tue, 18 Feb 2014 14:14:24 +0000 Subject: sched: Fix select_task_rq_fair() description comments Brings select_task_rq_fair() description comments up-to-date. Signed-off-by: Morten Rasmussen Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392732864-10927-1-git-send-email-morten.rasmussen@arm.com Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e884e45982af..7982faf7223b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4397,13 +4397,14 @@ done: } /* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * - * Balance, ie. select the least loaded group. + * Balances load by selecting the idlest cpu in the idlest group, or under + * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. * - * Returns the target CPU number, or the same CPU if no balancing is needed. + * Returns the target cpu number. * * preempt must be disabled. */ -- cgit v1.3-14-g43fede From d987fc7f3228bf94cb6b21313ebab1d64ee637ad Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 5 Dec 2011 10:01:47 +0100 Subject: sched, nohz: Exclude isolated cores from load balancing The user explicitly disabled load balancing, else this core would not be disconnected. Don't add these to nohz.idle_cpus_mask. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Lei Wen Link: http://lkml.kernel.org/n/tip-vmme4f49psirp966pklm5l9j@git.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7982faf7223b..a3a41c61a2c9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6788,6 +6788,11 @@ out_unlock: return 0; } +static inline int on_null_domain(struct rq *rq) +{ + return unlikely(!rcu_dereference_sched(rq->sd)); +} + #ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details @@ -6842,8 +6847,13 @@ static void nohz_balancer_kick(void) static inline void nohz_balance_exit_idle(int cpu) { if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - atomic_dec(&nohz.nr_cpus); + /* + * Completely isolated CPUs don't ever set, so we must test. + */ + if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) { + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } } @@ -6897,6 +6907,12 @@ void nohz_balance_enter_idle(int cpu) if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; + /* + * If we're a completely isolated CPU, we don't play. 
+ */ + if (on_null_domain(cpu_rq(cpu))) + return; + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); atomic_inc(&nohz.nr_cpus); set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); @@ -7159,11 +7175,6 @@ static void run_rebalance_domains(struct softirq_action *h) nohz_idle_balance(this_rq, idle); } -static inline int on_null_domain(struct rq *rq) -{ - return !rcu_dereference_sched(rq->sd); -} - /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -- cgit v1.3-14-g43fede From 806274c018e9858320a27b785df761f45c33a56c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 17 Feb 2014 13:13:05 -0800 Subject: rcutorture: Fix checkpatch complaint This commit does a code-style cleanup so that the first curly brace of an initializer does not appear at the beginning of a line. Signed-off-by: Paul E. McKenney --- kernel/rcu/torture.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c index 732f8ae3086a..dad67238d086 100644 --- a/kernel/rcu/torture.c +++ b/kernel/rcu/torture.c @@ -170,10 +170,10 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = - { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = - { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; -- cgit v1.3-14-g43fede From 51b1130eb5823ddb90a9ad07d243031d8cb7ecf2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Jan 2014 11:49:39 -0800 Subject: rcutorture: Abstract rcu_torture_random() Because rcu_torture_random() will be used by the locking equivalent to rcutorture, pull it out into its own module. This new module cannot be separately configured; instead, it is selected via the Kconfig "select" statement in the Kconfig options of the tests that depend on it. Suggested-by: Rusty Russell Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 47 ++ kernel/Makefile | 1 + kernel/rcu/Makefile | 2 +- kernel/rcu/rcutorture.c | 2125 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcu/torture.c | 2148 ----------------------------------------------- kernel/torture.c | 71 ++ lib/Kconfig.debug | 5 + 7 files changed, 2250 insertions(+), 2149 deletions(-) create mode 100644 include/linux/torture.h create mode 100644 kernel/rcu/rcutorture.c delete mode 100644 kernel/rcu/torture.c create mode 100644 kernel/torture.c (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h new file mode 100644 index 000000000000..979e3e6b378a --- /dev/null +++ b/include/linux/torture.h @@ -0,0 +1,47 @@ +/* + * Common functions for in-kernel torture tests. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2014 + * + * Author: Paul E. McKenney + */ + +#ifndef __LINUX_TORTURE_H +#define __LINUX_TORTURE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct torture_random_state { + unsigned long trs_state; + long trs_count; +}; + +#define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } + +unsigned long torture_random(struct torture_random_state *trsp); + +#endif /* __LINUX_TORTURE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index bc010ee272b6..5c0e7666811d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o +obj-$(CONFIG_TORTURE_TEST) += torture.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 01e9ec37a3e3..807ccfbf69b3 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,5 +1,5 @@ obj-y += update.o srcu.o -obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c new file mode 100644 index 000000000000..94b1cd8b214c --- /dev/null +++ b/kernel/rcu/rcutorture.c @@ -0,0 +1,2125 @@ +/* + * Read-Copy Update module-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005, 2006 + * + * Authors: Paul E. McKenney + * Josh Triplett + * + * See also: Documentation/RCU/torture.txt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); + +MODULE_ALIAS("rcutorture"); +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "rcutorture." 
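+/*
+ * With the prefix defined above, each of the module parameters below
+ * can also be set on the kernel command line when rcutorture is built
+ * in, e.g. "rcutorture.nreaders=4 rcutorture.stat_interval=15"
+ * (parameter values here are illustrative only).
+ */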
+ +static int fqs_duration; +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); +static int fqs_holdoff; +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +static int fqs_stutter = 3; +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +static bool gp_exp; +module_param(gp_exp, bool, 0444); +MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); +static bool gp_normal; +module_param(gp_normal, bool, 0444); +MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +static int irqreader = 1; +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int n_barrier_cbs; +module_param(n_barrier_cbs, int, 0444); +MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); +static int nfakewriters = 4; +module_param(nfakewriters, int, 0444); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); +static int nreaders = -1; +module_param(nreaders, int, 0444); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +static int object_debug; +module_param(object_debug, int, 0444); +MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); +static int onoff_holdoff; +module_param(onoff_holdoff, int, 0444); +MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); +static int onoff_interval; +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int shuffle_interval = 3; +module_param(shuffle_interval, int, 0444); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +static int shutdown_secs; +module_param(shutdown_secs, int, 0444); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); +static int stall_cpu; +module_param(stall_cpu, int, 0444); +MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +static int stall_cpu_holdoff = 10; +module_param(stall_cpu_holdoff, int, 0444); +MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); +static int stat_interval = 60; +module_param(stat_interval, int, 0644); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +static int stutter = 5; +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +static int test_boost = 1; +module_param(test_boost, int, 0444); +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +static int test_boost_duration = 4; +module_param(test_boost_duration, int, 0444); +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); +static int test_boost_interval = 7; +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static bool test_no_idle_hz = true; +module_param(test_no_idle_hz, bool, 0444); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +static char *torture_type = "rcu"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +static bool verbose; +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); + +#define TORTURE_FLAG "-torture:" +#define PRINTK_STRING(s) \ + do { pr_alert("%s" TORTURE_FLAG s "\n", 
torture_type); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **fakewriter_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; +static struct task_struct *shuffler_task; +static struct task_struct *stutter_task; +static struct task_struct *fqs_task; +static struct task_struct *boost_tasks[NR_CPUS]; +static struct task_struct *shutdown_task; +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *onoff_task; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static struct task_struct *stall_task; +static struct task_struct **barrier_cbs_tasks; +static struct task_struct *barrier_task; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; + int rtort_mbtest; +}; + +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture __rcu *rcu_torture_current; +static unsigned long rcu_torture_current_version; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], + rcu_torture_batch) = { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static atomic_t n_rcu_torture_alloc; +static atomic_t n_rcu_torture_alloc_fail; +static atomic_t n_rcu_torture_free; +static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_error; +static long n_rcu_torture_barrier_error; +static long n_rcu_torture_boost_ktrerror; +static long n_rcu_torture_boost_rterror; +static long n_rcu_torture_boost_failure; +static long n_rcu_torture_boosts; +static long n_rcu_torture_timers; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; +static long n_barrier_attempts; +static long n_barrier_successes; +static struct list_head rcu_torture_removed; +static cpumask_var_t shuffle_tmp_mask; + +static int stutter_pause_test; + +#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) +#define RCUTORTURE_RUNNABLE_INIT 1 +#else +#define RCUTORTURE_RUNNABLE_INIT 0 +#endif +int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; +module_param(rcutorture_runnable, int, 0444); +MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); + +#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) +#define rcu_can_boost() 1 +#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#define rcu_can_boost() 0 +#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ + +#ifdef CONFIG_RCU_TRACE +static u64 notrace rcu_trace_clock_local(void) +{ + u64 ts = trace_clock_local(); + unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); + return ts; +} +#else /* #ifdef CONFIG_RCU_TRACE */ +static u64 notrace rcu_trace_clock_local(void) +{ + return 0ULL; +} +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +static unsigned long shutdown_time; /* jiffies to system shutdown. 
*/ +static unsigned long boost_starttime; /* jiffies of next boost test start. */ +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ + /* and boost task create/destroy. */ +static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ +static bool barrier_phase; /* Test phase. */ +static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ +static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ +static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); + +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ + +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +static int fullstop = FULLSTOP_RMMOD; +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); + +/* Forward reference. */ +static void rcu_torture_cleanup(void); + +/* + * Detect and respond to a system shutdown. + */ +static int +rcutorture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + pr_warn(/* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +static void rcutorture_shutdown_absorb(const char *title) +{ + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice( + "rcutorture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} + +/* + * Allocate an element from the rcu_tortures pool. + */ +static struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock_bh(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); + spin_unlock_bh(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock_bh(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock_bh(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock_bh(&rcu_torture_lock); +} + +static void +rcu_stutter_wait(const char *title) +{ + while (stutter_pause_test || !rcutorture_runnable) { + if (rcutorture_runnable) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + rcutorture_shutdown_absorb(title); + } +} + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_torture_ops { + void (*init)(void); + int (*readlock)(void); + void (*read_delay)(struct torture_random_state *rrsp); + void (*readunlock)(int idx); + int (*completed)(void); + void (*deferred_free)(struct rcu_torture *p); + void (*sync)(void); + void (*exp_sync)(void); + void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + void (*cb_barrier)(void); + void (*fqs)(void); + void (*stats)(char *page); + int irq_capable; + int can_boost; + const char *name; +}; + +static struct rcu_torture_ops *cur_ops; + +/* + * Definitions for rcu torture testing. 
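+ *
+ * Each flavor under test supplies a struct rcu_torture_ops instance;
+ * the rcu_ops vector below maps the hooks directly onto
+ * rcu_read_lock(), call_rcu(), synchronize_rcu() and friends.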
+ */ + +static int rcu_torture_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_read_delay(struct torture_random_state *rrsp) +{ + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; + + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. */ + + if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && + !(torture_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif +} + +static void rcu_torture_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static int rcu_torture_completed(void) +{ + return rcu_batches_completed(); +} + +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop != FULLSTOP_DONTSTOP) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces. */ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else { + cur_ops->deferred_free(rp); + } +} + +static int rcu_no_completed(void) +{ + return 0; +} + +static void rcu_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu(&p->rtort_rcu, rcu_torture_cb); +} + +static void rcu_sync_torture_init(void) +{ + INIT_LIST_HEAD(&rcu_torture_removed); +} + +static struct rcu_torture_ops rcu_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .call = call_rcu, + .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .can_boost = rcu_can_boost(), + .name = "rcu" +}; + +/* + * Definitions for rcu_bh torture testing. + */ + +static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static int rcu_bh_torture_completed(void) +{ + return rcu_batches_completed_bh(); +} + +static void rcu_bh_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_bh_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_bh_torture_deferred_free, + .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, + .call = call_rcu_bh, + .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh" +}; + +/* + * Definitions for srcu torture testing. 
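+ *
+ * Unlike vanilla RCU, srcu_read_lock() returns an index that must be
+ * handed back to srcu_read_unlock(), which is why the ops vector
+ * threads the idx value through readlock()/readunlock().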
+ */ + +DEFINE_STATIC_SRCU(srcu_ctl); + +static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) +{ + return srcu_read_lock(&srcu_ctl); +} + +static void srcu_read_delay(struct torture_random_state *rrsp) +{ + long delay; + const long uspertick = 1000000 / HZ; + const long longdelay = 10; + + /* We want there to be long-running readers, but not all the time. */ + + delay = torture_random(rrsp) % + (nrealreaders * 2 * longdelay * uspertick); + if (!delay) + schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); +} + +static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) +{ + srcu_read_unlock(&srcu_ctl, idx); +} + +static int srcu_torture_completed(void) +{ + return srcu_batches_completed(&srcu_ctl); +} + +static void srcu_torture_deferred_free(struct rcu_torture *rp) +{ + call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); +} + +static void srcu_torture_synchronize(void) +{ + synchronize_srcu(&srcu_ctl); +} + +static void srcu_torture_call(struct rcu_head *head, + void (*func)(struct rcu_head *head)) +{ + call_srcu(&srcu_ctl, head, func); +} + +static void srcu_torture_barrier(void) +{ + srcu_barrier(&srcu_ctl); +} + +static void srcu_torture_stats(char *page) +{ + int cpu; + int idx = srcu_ctl.completed & 0x1; + + page += sprintf(page, "%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); + for_each_possible_cpu(cpu) { + page += sprintf(page, " %d(%lu,%lu)", cpu, + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + } + sprintf(page, "\n"); +} + +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + +static struct rcu_torture_ops srcu_ops = { + .init = rcu_sync_torture_init, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = srcu_torture_deferred_free, + .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, + .call = srcu_torture_call, + .cb_barrier = srcu_torture_barrier, + .stats = srcu_torture_stats, + .name = "srcu" +}; + +/* + * Definitions for sched torture testing. + */ + +static int sched_torture_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_torture_read_unlock(int idx) +{ + preempt_enable(); +} + +static void rcu_sched_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops sched_ops = { + .init = rcu_sync_torture_init, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sched_torture_deferred_free, + .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .call = call_rcu_sched, + .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "sched" +}; + +/* + * RCU torture priority-boost testing. Runs one real-time thread per + * CPU for moderate bursts, repeatedly registering RCU callbacks and + * spinning waiting for them to be invoked. If a given callback takes + * too long to be invoked, we assume that priority inversion has occurred. 
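+ * (Concretely, a callback that is still outstanding after
+ * test_boost_duration * HZ - HZ / 2 jiffies is counted as a boost
+ * failure by the check below.)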
+ */ + +struct rcu_boost_inflight { + struct rcu_head rcu; + int inflight; +}; + +static void rcu_torture_boost_cb(struct rcu_head *head) +{ + struct rcu_boost_inflight *rbip = + container_of(head, struct rcu_boost_inflight, rcu); + + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ + rbip->inflight = 0; +} + +static int rcu_torture_boost(void *arg) +{ + unsigned long call_rcu_time; + unsigned long endtime; + unsigned long oldstarttime; + struct rcu_boost_inflight rbi = { .inflight = 0 }; + struct sched_param sp; + + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + + /* Set real-time priority. */ + sp.sched_priority = 1; + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + n_rcu_torture_boost_rterror++; + } + + init_rcu_head_on_stack(&rbi.rcu); + /* Each pass through the following loop does one boost-test cycle. */ + do { + /* Wait for the next test interval. */ + oldstarttime = boost_starttime; + while (ULONG_CMP_LT(jiffies, oldstarttime)) { + schedule_timeout_interruptible(oldstarttime - jiffies); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* Do one boost-test interval. */ + endtime = oldstarttime + test_boost_duration * HZ; + call_rcu_time = jiffies; + while (ULONG_CMP_LT(jiffies, endtime)) { + /* If we don't have a callback in flight, post one. */ + if (!rbi.inflight) { + smp_mb(); /* RCU core before ->inflight = 1. */ + rbi.inflight = 1; + call_rcu(&rbi.rcu, rcu_torture_boost_cb); + if (jiffies - call_rcu_time > + test_boost_duration * HZ - HZ / 2) { + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + } + call_rcu_time = jiffies; + } + cond_resched(); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* + * Set the start time of the next test interval. + * Yes, this is vulnerable to long delays, but such + * delays simply cause a false negative for the next + * interval. Besides, we are running at RT priority, + * so delays should be relatively rare. + */ + while (oldstarttime == boost_starttime && + !kthread_should_stop()) { + if (mutex_trylock(&boost_mutex)) { + boost_starttime = jiffies + + test_boost_interval * HZ; + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + + /* Go do the stutter. */ +checkwait: rcu_stutter_wait("rcu_torture_boost"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + /* Clean up and exit. */ + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + rcutorture_shutdown_absorb("rcu_torture_boost"); + while (!kthread_should_stop() || rbi.inflight) + schedule_timeout_uninterruptible(1); + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + destroy_rcu_head_on_stack(&rbi.rcu); + return 0; +} + +/* + * RCU torture force-quiescent-state kthread. Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions. 
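+ * Within a burst, cur_ops->fqs() is invoked every fqs_holdoff
+ * microseconds for fqs_duration microseconds in total, and successive
+ * bursts are spaced fqs_stutter seconds apart.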
+ */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + !kthread_should_stop()) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + while (fqs_burst_remaining > 0 && + !kthread_should_stop()) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + bool exp; + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + struct rcu_torture *old_rp; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + rp = rcu_torture_alloc(); + if (rp == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(torture_random(&rand) & 0x3ff); + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); + rp->rtort_mbtest = 1; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ + if (old_rp) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + if (gp_normal == gp_exp) + exp = !!(torture_random(&rand) & 0x80); + else + exp = gp_exp; + if (!exp) { + cur_ops->deferred_free(old_rp); + } else { + cur_ops->exp_sync(); + list_add(&old_rp->rtort_free, + &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, + &rcu_torture_removed, + rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= + RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } + } + } + rcutorture_record_progress(++rcu_torture_current_version); + rcu_stutter_wait("rcu_torture_writer"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + rcutorture_shutdown_absorb("rcu_torture_writer"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture fake writer kthread. Repeatedly calls sync, with a random + * delay between calls. 
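+ * Fake writers exercise the synchronous, expedited and callback-barrier
+ * grace-period primitives concurrently with the real update stream,
+ * but never modify rcu_torture_current themselves.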
+ */ +static int +rcu_torture_fakewriter(void *arg) +{ + DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); + udelay(torture_random(&rand) & 0x3ff); + if (cur_ops->cb_barrier != NULL && + torture_random(&rand) % (nfakewriters * 8) == 0) { + cur_ops->cb_barrier(); + } else if (gp_normal == gp_exp) { + if (torture_random(&rand) & 0x80) + cur_ops->sync(); + else + cur_ops->exp_sync(); + } else if (gp_normal) { + cur_ops->sync(); + } else { + cur_ops->exp_sync(); + } + rcu_stutter_wait("rcu_torture_fakewriter"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +void rcutorture_trace_dump(void) +{ + static atomic_t beenhere = ATOMIC_INIT(0); + + if (atomic_read(&beenhere)) + return; + if (atomic_xchg(&beenhere, 1) != 0) + return; + ftrace_dump(DUMP_ALL); +} + +/* + * RCU torture reader from timer handler. Dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. + */ +static void rcu_torture_timer(unsigned long unused) +{ + int idx; + int completed; + int completed_end; + static DEFINE_TORTURE_RANDOM(rand); + static DEFINE_SPINLOCK(rand_lock); + struct rcu_torture *p; + int pipe_count; + unsigned long long ts; + + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); + if (p == NULL) { + /* Leave because rcu_torture_writer is not yet underway */ + cur_ops->readunlock(idx); + return; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + spin_lock(&rand_lock); + cur_ops->read_delay(&rand); + n_rcu_torture_timers++; + spin_unlock(&rand_lock); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, + completed, completed_end); + rcutorture_trace_dump(); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = completed_end - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken. 
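+ * (A pipe count above 1 means a reader could still reach an element
+ * that the writer had already pushed through more than one grace
+ * period, i.e. a grace period ended too soon.)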
+ */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + int completed_end; + int idx; + DEFINE_TORTURE_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + struct timer_list t; + unsigned long long ts; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + set_user_nice(current, 19); + if (irqreader && cur_ops->irq_capable) + setup_timer_on_stack(&t, rcu_torture_timer, 0); + + do { + if (irqreader && cur_ops->irq_capable) { + if (!timer_pending(&t)) + mod_timer(&t, jiffies + 1); + } + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + ts = rcu_trace_clock_local(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + cur_ops->readunlock(idx); + schedule_timeout_interruptible(HZ); + continue; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + cur_ops->read_delay(&rand); + preempt_disable(); + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + completed_end = cur_ops->completed(); + if (pipe_count > 1) { + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, + ts, completed, completed_end); + rcutorture_trace_dump(); + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = completed_end - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); + schedule(); + rcu_stutter_wait("rcu_torture_reader"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + rcutorture_shutdown_absorb("rcu_torture_reader"); + if (irqreader && cur_ops->irq_capable) + del_timer_sync(&t); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer. 
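+ * The buffer is allocated by rcu_torture_stats_print() below, sized
+ * at nr_cpu_ids * 200 + 8192 bytes, which bounds the sprintf()
+ * output generated here.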
+ */ +static void +rcu_torture_printk(char *page) +{ + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { + if (pipesummary[i] != 0) + break; + } + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free)); + page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror); + page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, + n_rcu_torture_timers); + page += sprintf(page, + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); + page += sprintf(page, "barrier: %ld/%ld:%ld", + n_barrier_successes, + n_barrier_attempts, + n_rcu_torture_barrier_error); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_barrier_error != 0 || + n_rcu_torture_boost_ktrerror != 0 || + n_rcu_torture_boost_rterror != 0 || + n_rcu_torture_boost_failure != 0 || + i > 1) { + page += sprintf(page, "!!! "); + atomic_inc(&n_rcu_torture_error); + WARN_ON_ONCE(1); + } + page += sprintf(page, "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + page += sprintf(page, " %ld", pipesummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + page += sprintf(page, " %ld", batchsummary[i]); + page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + page += sprintf(page, " %d", + atomic_read(&rcu_torture_wcount[i])); + } + page += sprintf(page, "\n"); + if (cur_ops->stats) + cur_ops->stats(page); +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + */ +static void +rcu_torture_stats_print(void) +{ + int size = nr_cpu_ids * 200 + 8192; + char *buf; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("rcu-torture: Out of memory, need: %d", size); + return; + } + rcu_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. 
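+ * It does still call rcutorture_shutdown_absorb() on each pass so
+ * that a system shutdown parks it like every other torture kthread.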
+ */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + rcutorture_shutdown_absorb("rcu_torture_stats"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ + +/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case + * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. + */ +static void rcu_torture_shuffle_tasks(void) +{ + int i; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + if (rcu_idle_cpu != -1) + cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); + + set_cpus_allowed_ptr(current, shuffle_tmp_mask); + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + if (reader_tasks[i]) + set_cpus_allowed_ptr(reader_tasks[i], + shuffle_tmp_mask); + } + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) + if (fakewriter_tasks[i]) + set_cpus_allowed_ptr(fakewriter_tasks[i], + shuffle_tmp_mask); + } + if (writer_task) + set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); + if (stats_task) + set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); + if (stutter_task) + set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); + if (fqs_task) + set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); + if (shutdown_task) + set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task) + set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + if (stall_task) + set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); + if (barrier_cbs_tasks) + for (i = 0; i < n_barrier_cbs; i++) + if (barrier_cbs_tasks[i]) + set_cpus_allowed_ptr(barrier_cbs_tasks[i], + shuffle_tmp_mask); + if (barrier_task) + set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); + + if (rcu_idle_cpu == -1) + rcu_idle_cpu = num_online_cpus() - 1; + else + rcu_idle_cpu--; + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int +rcu_torture_shuffle(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval * HZ); + rcu_torture_shuffle_tasks(); + rcutorture_shutdown_absorb("rcu_torture_shuffle"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + return 0; +} + +/* Cause the rcutorture test to "stutter", starting and stopping all + * threads periodically. 
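+ * Each cycle lets the test run for stutter seconds, then holds
+ * stutter_pause_test set for another stutter seconds; the other
+ * kthreads poll that flag in rcu_stutter_wait().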
+ */ +static int +rcu_torture_stutter(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + do { + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 1; + if (!kthread_should_stop()) + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 0; + rcutorture_shutdown_absorb("rcu_torture_stutter"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + return 0; +} + +static inline void +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) +{ + pr_alert("%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d shutdown_secs=%d " + "stall_cpu=%d stall_cpu_holdoff=%d " + "n_barrier_cbs=%d " + "onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration, shutdown_secs, + stall_cpu, stall_cpu_holdoff, + n_barrier_cbs, + onoff_interval, onoff_holdoff); +} + +static struct notifier_block rcutorture_shutdown_nb = { + .notifier_call = rcutorture_shutdown_notify, +}; + +static void rcutorture_booster_cleanup(int cpu) +{ + struct task_struct *t; + + if (boost_tasks[cpu] == NULL) + return; + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + t = boost_tasks[cpu]; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + + /* This must be outside of the mutex, otherwise deadlock! */ + kthread_stop(t); + boost_tasks[cpu] = NULL; +} + +static int rcutorture_booster_init(int cpu) +{ + int retval; + + if (boost_tasks[cpu] != NULL) + return 0; /* Already created, nothing more to do. */ + + /* Don't allow time recalculation while creating a new task. */ + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, + cpu_to_node(cpu), + "rcu_torture_boost"); + if (IS_ERR(boost_tasks[cpu])) { + retval = PTR_ERR(boost_tasks[cpu]); + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + n_rcu_torture_boost_ktrerror++; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + return retval; + } + kthread_bind(boost_tasks[cpu], cpu); + wake_up_process(boost_tasks[cpu]); + mutex_unlock(&boost_mutex); + return 0; +} + +/* + * Cause the rcutorture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs module parameter. + */ +static int +rcu_torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); + jiffies_snap = ACCESS_ONCE(jiffies); + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !kthread_should_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = ACCESS_ONCE(jiffies); + } + if (kthread_should_stop()) { + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); + return 0; + } + + /* OK, shut down the system. 
*/ + + VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + rcu_torture_cleanup(); /* Get the success/failure message. */ + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +rcu_torture_onoff(void *arg) +{ + int cpu; + unsigned long delta; + int maxcpu = -1; + DEFINE_TORTURE_RANDOM(rand); + int ret; + unsigned long starttime; + + VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); + } + while (!kthread_should_stop()) { + cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_offline_attempts++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: offlined %d\n", + torture_type, cpu); + n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_online_attempts++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: onlined %d\n", + torture_type, cpu); + n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if (max_online < delta) + max_online = delta; + } + } + schedule_timeout_interruptible(onoff_interval * HZ); + } + VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); + return 0; +} + +static int +rcu_torture_onoff_init(void) +{ + int ret; + + if (onoff_interval <= 0) + return 0; + onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); + if (IS_ERR(onoff_task)) { + ret = PTR_ERR(onoff_task); + onoff_task = NULL; + return ret; + } + return 0; +} + +static void rcu_torture_onoff_cleanup(void) +{ + if (onoff_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); + kthread_stop(onoff_task); + onoff_task = NULL; +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static int +rcu_torture_onoff_init(void) +{ + return 0; +} + +static void rcu_torture_onoff_cleanup(void) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then + * induces a CPU stall for the time specified by stall_cpu. 
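+ * The stall spins with preemption disabled inside an RCU read-side
+ * critical section, so the expected outcome is an RCU CPU stall
+ * warning rather than a silent hang.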
+ */ +static int rcu_torture_stall(void *args) +{ + unsigned long stop_at; + + VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + if (stall_cpu_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + schedule_timeout_interruptible(stall_cpu_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + } + if (!kthread_should_stop()) { + stop_at = get_seconds() + stall_cpu; + /* RCU CPU stall is expected behavior in following code. */ + pr_alert("rcu_torture_stall start.\n"); + rcu_read_lock(); + preempt_disable(); + while (ULONG_CMP_LT(get_seconds(), stop_at)) + continue; /* Induce RCU CPU stall warning. */ + preempt_enable(); + rcu_read_unlock(); + pr_alert("rcu_torture_stall end.\n"); + } + rcutorture_shutdown_absorb("rcu_torture_stall"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(10 * HZ); + return 0; +} + +/* Spawn CPU-stall kthread, if stall_cpu specified. */ +static int __init rcu_torture_stall_init(void) +{ + int ret; + + if (stall_cpu <= 0) + return 0; + stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); + if (IS_ERR(stall_task)) { + ret = PTR_ERR(stall_task); + stall_task = NULL; + return ret; + } + return 0; +} + +/* Clean up after the CPU-stall kthread, if one was spawned. */ +static void rcu_torture_stall_cleanup(void) +{ + if (stall_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); + kthread_stop(stall_task); + stall_task = NULL; +} + +/* Callback function for RCU barrier testing. */ +void rcu_torture_barrier_cbf(struct rcu_head *rcu) +{ + atomic_inc(&barrier_cbs_invoked); +} + +/* kthread function to register callbacks used to test RCU barriers. */ +static int rcu_torture_barrier_cbs(void *arg) +{ + long myid = (long)arg; + bool lastphase = 0; + bool newphase; + struct rcu_head rcu; + + init_rcu_head_on_stack(&rcu); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); + set_user_nice(current, 19); + do { + wait_event(barrier_cbs_wq[myid], + (newphase = + ACCESS_ONCE(barrier_phase)) != lastphase || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + lastphase = newphase; + smp_mb(); /* ensure barrier_phase load before ->call(). */ + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + cur_ops->call(&rcu, rcu_torture_barrier_cbf); + if (atomic_dec_and_test(&barrier_cbs_count)) + wake_up(&barrier_wq); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + cur_ops->cb_barrier(); + destroy_rcu_head_on_stack(&rcu); + return 0; +} + +/* kthread function to drive and coordinate RCU barrier testing. */ +static int rcu_torture_barrier(void *arg) +{ + int i; + + VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + do { + atomic_set(&barrier_cbs_invoked, 0); + atomic_set(&barrier_cbs_count, n_barrier_cbs); + smp_mb(); /* Ensure barrier_phase after prior assignments. */ + barrier_phase = !barrier_phase; + for (i = 0; i < n_barrier_cbs; i++) + wake_up(&barrier_cbs_wq[i]); + wait_event(barrier_wq, + atomic_read(&barrier_cbs_count) == 0 || + kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP); + if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + break; + n_barrier_attempts++; + cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ + if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { + n_rcu_torture_barrier_error++; + WARN_ON_ONCE(1); + } + n_barrier_successes++; + schedule_timeout_interruptible(HZ / 10); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); + rcutorture_shutdown_absorb("rcu_torture_barrier"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(1); + return 0; +} + +/* Initialize RCU barrier testing. */ +static int rcu_torture_barrier_init(void) +{ + int i; + int ret; + + if (n_barrier_cbs == 0) + return 0; + if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { + pr_alert("%s" TORTURE_FLAG + " Call or barrier ops missing for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " RCU barrier testing omitted from run.\n", + torture_type); + return 0; + } + atomic_set(&barrier_cbs_count, 0); + atomic_set(&barrier_cbs_invoked, 0); + barrier_cbs_tasks = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), + GFP_KERNEL); + barrier_cbs_wq = + kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), + GFP_KERNEL); + if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) + return -ENOMEM; + for (i = 0; i < n_barrier_cbs; i++) { + init_waitqueue_head(&barrier_cbs_wq[i]); + barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, + (void *)(long)i, + "rcu_torture_barrier_cbs"); + if (IS_ERR(barrier_cbs_tasks[i])) { + ret = PTR_ERR(barrier_cbs_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); + barrier_cbs_tasks[i] = NULL; + return ret; + } + } + barrier_task = kthread_run(rcu_torture_barrier, NULL, + "rcu_torture_barrier"); + if (IS_ERR(barrier_task)) { + ret = PTR_ERR(barrier_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); + barrier_task = NULL; + } + return 0; +} + +/* Clean up after RCU barrier testing. */ +static void rcu_torture_barrier_cleanup(void) +{ + int i; + + if (barrier_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + kthread_stop(barrier_task); + barrier_task = NULL; + } + if (barrier_cbs_tasks != NULL) { + for (i = 0; i < n_barrier_cbs; i++) { + if (barrier_cbs_tasks[i] != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + kthread_stop(barrier_cbs_tasks[i]); + barrier_cbs_tasks[i] = NULL; + } + } + kfree(barrier_cbs_tasks); + barrier_cbs_tasks = NULL; + } + if (barrier_cbs_wq != NULL) { + kfree(barrier_cbs_wq); + barrier_cbs_wq = NULL; + } +} + +static int rcutorture_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + (void)rcutorture_booster_init(cpu); + break; + case CPU_DOWN_PREPARE: + rcutorture_booster_cleanup(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_cpu_nb = { + .notifier_call = rcutorture_cpu_notify, +}; + +static void +rcu_torture_cleanup(void) +{ + int i; + + mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); + if (fullstop == FULLSTOP_SHUTDOWN) { + pr_warn(/* but going down anyway, so... 
*/ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + return; + } + fullstop = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_barrier_cleanup(); + rcu_torture_stall_cleanup(); + if (stutter_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + kthread_stop(stutter_task); + } + stutter_task = NULL; + if (shuffler_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; + + if (writer_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) { + if (fakewriter_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_fakewriter task"); + kthread_stop(fakewriter_tasks[i]); + } + fakewriter_tasks[i] = NULL; + } + kfree(fakewriter_tasks); + fakewriter_tasks = NULL; + } + + if (stats_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + unregister_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) + rcutorture_booster_cleanup(i); + } + if (shutdown_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; + rcu_torture_onoff_cleanup(); + + /* Wait for all RCU callbacks to fire. */ + + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); + else if (n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts) + rcu_torture_print_module_parms(cur_ops, + "End of test: RCU_HOTPLUG"); + else + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); +} + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static void rcu_torture_leak_cb(struct rcu_head *rhp) +{ +} + +static void rcu_torture_err_cb(struct rcu_head *rhp) +{ + /* + * This -might- happen due to race conditions, but is unlikely. + * The scenario that leads to this happening is that the + * first of the pair of duplicate callbacks is queued, + * someone else starts a grace period that includes that + * callback, then the second of the pair must wait for the + * next grace period. Unlikely, but can happen. If it + * does happen, the debug-objects subsystem won't have splatted. + */ + pr_alert("rcutorture: duplicated callback was invoked.\n"); +} +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +/* + * Verify that double-free causes debug-objects to complain, but only + * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test + * cannot be carried out. 
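+ * The test below queues the same rcu_head twice without an
+ * intervening grace period, which debug-objects should report as a
+ * duplicate call_rcu().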
+ */ +static void rcu_test_debug_objects(void) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + struct rcu_head rh1; + struct rcu_head rh2; + + init_rcu_head_on_stack(&rh1); + init_rcu_head_on_stack(&rh2); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + + /* Try to queue the rh2 pair of callbacks for the same grace period. */ + preempt_disable(); /* Prevent preemption from interrupting test. */ + rcu_read_lock(); /* Make it impossible to finish a grace period. */ + call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + local_irq_disable(); /* Make it harder to start a new grace period. */ + call_rcu(&rh2, rcu_torture_leak_cb); + call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + local_irq_enable(); + rcu_read_unlock(); + preempt_enable(); + + /* Wait for them all to get done so we can safely return. */ + rcu_barrier(); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + destroy_rcu_head_on_stack(&rh1); + destroy_rcu_head_on_stack(&rh2); +#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); +#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +} + +static int __init +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + int retval; + static struct rcu_torture_ops *torture_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + }; + + mutex_lock(&fullstop_mutex); + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("rcu-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("rcu-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + mutex_unlock(&fullstop_mutex); + return -EINVAL; + } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); + rcu_torture_print_module_parms(cur_ops, "Start of test"); + fullstop = FULLSTOP_DONTSTOP; + + /* Set up the freelist. */ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { + rcu_tortures[i].rtort_mbtest = 0; + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_barrier_error = 0; + n_rcu_torture_boost_ktrerror = 0; + n_rcu_torture_boost_rterror = 0; + n_rcu_torture_boost_failure = 0; + n_rcu_torture_boosts = 0; + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. 
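rcu_torture_init() above selects its operations vector by matching the torture_type string against the torture_ops[] table, listing the valid names when nothing matches. A small self-contained sketch of the same name-to-vtable lookup (the table contents here are placeholders, not the real ops):

#include <stdio.h>
#include <string.h>

struct torture_ops {
        const char *name;
        void (*init)(void);
};

static void rcu_like_init(void) { }
static void srcu_like_init(void) { }

/* hypothetical table mirroring the shape of torture_ops[] above */
static struct torture_ops ops_table[] = {
        { "rcu",  rcu_like_init },
        { "srcu", srcu_like_init },
};

static struct torture_ops *ops_lookup(const char *type)
{
        size_t i;

        for (i = 0; i < sizeof(ops_table) / sizeof(ops_table[0]); i++)
                if (strcmp(type, ops_table[i].name) == 0)
                        return &ops_table[i];
        fprintf(stderr, "invalid torture type: \"%s\"\nvalid types:", type);
        for (i = 0; i < sizeof(ops_table) / sizeof(ops_table[0]); i++)
                fprintf(stderr, " %s", ops_table[i].name);
        fprintf(stderr, "\n");
        return NULL;    /* the kernel maps this case to -EINVAL */
}

int main(void)
{
        struct torture_ops *ops = ops_lookup("srcu");

        if (ops)
                ops->init();
        return ops ? 0 : 1;
}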
 */
+
+	VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
+	writer_task = kthread_create(rcu_torture_writer, NULL,
+				     "rcu_torture_writer");
+	if (IS_ERR(writer_task)) {
+		firsterr = PTR_ERR(writer_task);
+		VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
+		writer_task = NULL;
+		goto unwind;
+	}
+	wake_up_process(writer_task);
+	fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
+				   GFP_KERNEL);
+	if (fakewriter_tasks == NULL) {
+		VERBOSE_PRINTK_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nfakewriters; i++) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
+		fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
+						  "rcu_torture_fakewriter");
+		if (IS_ERR(fakewriter_tasks[i])) {
+			firsterr = PTR_ERR(fakewriter_tasks[i]);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
+			fakewriter_tasks[i] = NULL;
+			goto unwind;
+		}
+	}
+	reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (reader_tasks == NULL) {
+		VERBOSE_PRINTK_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealreaders; i++) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
+		reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
+					      "rcu_torture_reader");
+		if (IS_ERR(reader_tasks[i])) {
+			firsterr = PTR_ERR(reader_tasks[i]);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
+			reader_tasks[i] = NULL;
+			goto unwind;
+		}
+	}
+	if (stat_interval > 0) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
+		stats_task = kthread_run(rcu_torture_stats, NULL,
+					 "rcu_torture_stats");
+		if (IS_ERR(stats_task)) {
+			firsterr = PTR_ERR(stats_task);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
+			stats_task = NULL;
+			goto unwind;
+		}
+	}
+	if (test_no_idle_hz) {
+		rcu_idle_cpu = num_online_cpus() - 1;
+
+		if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+			firsterr = -ENOMEM;
+			VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
+			goto unwind;
+		}
+
+		/* Create the shuffler thread */
+		shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
+					    "rcu_torture_shuffle");
+		if (IS_ERR(shuffler_task)) {
+			free_cpumask_var(shuffle_tmp_mask);
+			firsterr = PTR_ERR(shuffler_task);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
+			shuffler_task = NULL;
+			goto unwind;
+		}
+	}
+	if (stutter < 0)
+		stutter = 0;
+	if (stutter) {
+		/* Create the stutter thread */
+		stutter_task = kthread_run(rcu_torture_stutter, NULL,
+					   "rcu_torture_stutter");
+		if (IS_ERR(stutter_task)) {
+			firsterr = PTR_ERR(stutter_task);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
+			stutter_task = NULL;
+			goto unwind;
+		}
+	}
+	if (fqs_duration < 0)
+		fqs_duration = 0;
+	if (fqs_duration) {
+		/* Create the fqs thread */
+		fqs_task = kthread_run(rcu_torture_fqs, NULL,
+				       "rcu_torture_fqs");
+		if (IS_ERR(fqs_task)) {
+			firsterr = PTR_ERR(fqs_task);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
+			fqs_task = NULL;
+			goto unwind;
+		}
+	}
+	if (test_boost_interval < 1)
+		test_boost_interval = 1;
+	if (test_boost_duration < 2)
+		test_boost_duration = 2;
+	if ((test_boost == 1 && cur_ops->can_boost) ||
+	    test_boost == 2) {
+
+		boost_starttime = jiffies + test_boost_interval * HZ;
+		register_cpu_notifier(&rcutorture_cpu_nb);
+		for_each_possible_cpu(i) {
+			if (cpu_is_offline(i))
+				continue;	/* Heuristic: CPU can go offline.
*/ + retval = rcutorture_booster_init(i); + if (retval < 0) { + firsterr = retval; + goto unwind; + } + } + } + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_create(rcu_torture_shutdown, NULL, + "rcu_torture_shutdown"); + if (IS_ERR(shutdown_task)) { + firsterr = PTR_ERR(shutdown_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + goto unwind; + } + wake_up_process(shutdown_task); + } + i = rcu_torture_onoff_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + register_reboot_notifier(&rcutorture_shutdown_nb); + i = rcu_torture_stall_init(); + if (i != 0) { + firsterr = i; + goto unwind; + } + retval = rcu_torture_barrier_init(); + if (retval != 0) { + firsterr = retval; + goto unwind; + } + if (object_debug) + rcu_test_debug_objects(); + rcutorture_record_test_transition(); + mutex_unlock(&fullstop_mutex); + return 0; + +unwind: + mutex_unlock(&fullstop_mutex); + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c deleted file mode 100644 index dad67238d086..000000000000 --- a/kernel/rcu/torture.c +++ /dev/null @@ -1,2148 +0,0 @@ -/* - * Read-Copy Update module-based torture test facility - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005, 2006 - * - * Authors: Paul E. McKenney - * Josh Triplett - * - * See also: Documentation/RCU/torture.txt - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); - -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." 
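The init function above relies on a single "unwind" label: resources are acquired in order, and any failure jumps to one exit path that releases whatever already exists. A compact userspace illustration of the pattern, using hypothetical testctx/ctx_init() names; the key points are initializing every handle before the first failure can occur, and keeping release idempotent:

#include <stdlib.h>

struct testctx {
        int *readers;
        int *writers;
};

static int ctx_init(struct testctx *ctx, int n)
{
        int err = 0;

        /* NULL everything first so the unwind path is always safe. */
        ctx->readers = NULL;
        ctx->writers = NULL;

        ctx->readers = calloc(n, sizeof(*ctx->readers));
        if (!ctx->readers) {
                err = -1;
                goto unwind;
        }
        ctx->writers = calloc(n, sizeof(*ctx->writers));
        if (!ctx->writers) {
                err = -1;
                goto unwind;
        }
        return 0;
unwind:
        free(ctx->writers);     /* free(NULL) is a no-op */
        free(ctx->readers);
        ctx->readers = NULL;
        ctx->writers = NULL;
        return err;
}

int main(void)
{
        struct testctx ctx;

        /* (cleanup of the success path omitted for brevity) */
        return ctx_init(&ctx, 8) ? 1 : 0;
}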
- -static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; -module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -static char *torture_type = "rcu"; -module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); - -#define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", 
torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - -static int nrealreaders; -static struct task_struct *writer_task; -static struct task_struct **fakewriter_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *stats_task; -static struct task_struct *shuffler_task; -static struct task_struct *stutter_task; -static struct task_struct *fqs_task; -static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -static struct task_struct *stall_task; -static struct task_struct **barrier_cbs_tasks; -static struct task_struct *barrier_task; - -#define RCU_TORTURE_PIPE_LEN 10 - -struct rcu_torture { - struct rcu_head rtort_rcu; - int rtort_pipe_count; - struct list_head rtort_free; - int rtort_mbtest; -}; - -static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture __rcu *rcu_torture_current; -static unsigned long rcu_torture_current_version; -static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; -static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; -static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; -static atomic_t n_rcu_torture_alloc; -static atomic_t n_rcu_torture_alloc_fail; -static atomic_t n_rcu_torture_free; -static atomic_t n_rcu_torture_mberror; -static atomic_t n_rcu_torture_error; -static long n_rcu_torture_barrier_error; -static long n_rcu_torture_boost_ktrerror; -static long n_rcu_torture_boost_rterror; -static long n_rcu_torture_boost_failure; -static long n_rcu_torture_boosts; -static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; -static long n_barrier_attempts; -static long n_barrier_successes; -static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; - -static int stutter_pause_test; - -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; -module_param(rcutorture_runnable, int, 0444); -MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); - -#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) -#define rcu_can_boost() 1 -#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ -#define rcu_can_boost() 0 -#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ - -#ifdef CONFIG_RCU_TRACE -static u64 notrace rcu_trace_clock_local(void) -{ - u64 ts = trace_clock_local(); - unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); - return ts; -} -#else /* #ifdef CONFIG_RCU_TRACE */ -static u64 notrace rcu_trace_clock_local(void) -{ - return 0ULL; -} -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - -static unsigned long shutdown_time; /* jiffies to system shutdown. 
*/ -static unsigned long boost_starttime; /* jiffies of next boost test start. */ -DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ - /* and boost task create/destroy. */ -static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ -static bool barrier_phase; /* Test phase. */ -static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ -static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ -static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); - -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - -/* Forward reference. */ -static void rcu_torture_cleanup(void); - -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - -/* - * Allocate an element from the rcu_tortures pool. - */ -static struct rcu_torture * -rcu_torture_alloc(void) -{ - struct list_head *p; - - spin_lock_bh(&rcu_torture_lock); - if (list_empty(&rcu_torture_freelist)) { - atomic_inc(&n_rcu_torture_alloc_fail); - spin_unlock_bh(&rcu_torture_lock); - return NULL; - } - atomic_inc(&n_rcu_torture_alloc); - p = rcu_torture_freelist.next; - list_del_init(p); - spin_unlock_bh(&rcu_torture_lock); - return container_of(p, struct rcu_torture, rtort_free); -} - -/* - * Free an element to the rcu_tortures pool. - */ -static void -rcu_torture_free(struct rcu_torture *p) -{ - atomic_inc(&n_rcu_torture_free); - spin_lock_bh(&rcu_torture_lock); - list_add_tail(&p->rtort_free, &rcu_torture_freelist); - spin_unlock_bh(&rcu_torture_lock); -} - -struct rcu_random_state { - unsigned long rrs_state; - long rrs_count; -}; - -#define RCU_RANDOM_MULT 39916801 /* prime */ -#define RCU_RANDOM_ADD 479001701 /* prime */ -#define RCU_RANDOM_REFRESH 10000 - -#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } - -/* - * Crude but fast random-number generator. Uses a linear congruential - * generator, with occasional help from cpu_clock(). 
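rcu_torture_alloc() and rcu_torture_free() above manage a fixed pool through a lock-protected free list, counting allocation failures rather than failing hard. A userspace sketch of the same pool, with a pthread mutex in place of the BH-disabling spinlock and illustrative names throughout:

#include <pthread.h>
#include <stdio.h>

struct item {
        struct item *next;
        int payload;
};

static struct item pool[16];
static struct item *freelist;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static long n_alloc, n_alloc_fail, n_free;

static void pool_init(void)
{
        int i;

        for (i = 0; i < 16; i++) {
                pool[i].next = freelist;
                freelist = &pool[i];
        }
}

static struct item *item_alloc(void)
{
        struct item *p;

        pthread_mutex_lock(&pool_lock);
        p = freelist;
        if (p) {
                freelist = p->next;
                n_alloc++;
        } else {
                n_alloc_fail++; /* counted, like n_rcu_torture_alloc_fail */
        }
        pthread_mutex_unlock(&pool_lock);
        return p;
}

static void item_free(struct item *p)
{
        pthread_mutex_lock(&pool_lock);
        p->next = freelist;
        freelist = p;
        n_free++;
        pthread_mutex_unlock(&pool_lock);
}

int main(void)
{
        struct item *p;

        pool_init();
        p = item_alloc();
        if (p)
                item_free(p);
        printf("alloc=%ld fail=%ld free=%ld\n", n_alloc, n_alloc_fail, n_free);
        return 0;
}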
- */ -static unsigned long -rcu_random(struct rcu_random_state *rrsp) -{ - if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += (unsigned long)local_clock(); - rrsp->rrs_count = RCU_RANDOM_REFRESH; - } - rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; - return swahw32(rrsp->rrs_state); -} - -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); - } -} - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_torture_ops { - void (*init)(void); - int (*readlock)(void); - void (*read_delay)(struct rcu_random_state *rrsp); - void (*readunlock)(int idx); - int (*completed)(void); - void (*deferred_free)(struct rcu_torture *p); - void (*sync)(void); - void (*exp_sync)(void); - void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); - void (*cb_barrier)(void); - void (*fqs)(void); - void (*stats)(char *page); - int irq_capable; - int can_boost; - const char *name; -}; - -static struct rcu_torture_ops *cur_ops; - -/* - * Definitions for rcu torture testing. - */ - -static int rcu_torture_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_read_delay(struct rcu_random_state *rrsp) -{ - const unsigned long shortdelay_us = 200; - const unsigned long longdelay_ms = 50; - - /* We want a short delay sometimes to make a reader delay the grace - * period, and we want a long delay occasionally to trigger - * force_quiescent_state. */ - - if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) - mdelay(longdelay_ms); - if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) - udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT - if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif -} - -static void rcu_torture_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static int rcu_torture_completed(void) -{ - return rcu_batches_completed(); -} - -static void -rcu_torture_cb(struct rcu_head *p) -{ - int i; - struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - - if (fullstop != FULLSTOP_DONTSTOP) { - /* Test is ending, just drop callbacks on the floor. */ - /* The next initialization will pick up the pieces. 
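rcu_random() above is a linear congruential generator that occasionally folds in clock entropy and then scrambles the result before use. A runnable sketch with the same primes; where the kernel swaps 16-bit halves via swahw32(), this version simply shifts away the weak low-order bits:

#include <stdio.h>
#include <time.h>

#define RNG_MULT    39916801UL  /* same primes as rcu_random() */
#define RNG_ADD     479001701UL
#define RNG_REFRESH 10000

struct rng_state {
        unsigned long state;
        long count;
};

static unsigned long rng_next(struct rng_state *r)
{
        if (--r->count < 0) {
                /* occasional reseed from a clock, standing in for local_clock() */
                r->state += (unsigned long)clock();
                r->count = RNG_REFRESH;
        }
        r->state = r->state * RNG_MULT + RNG_ADD;
        return r->state >> 16;  /* discard the weakest low-order bits */
}

int main(void)
{
        struct rng_state r = { 0, 0 };
        int i;

        for (i = 0; i < 4; i++)
                printf("%lu\n", rng_next(&r) % 100);
        return 0;
}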
*/ - return; - } - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - rcu_torture_free(rp); - } else { - cur_ops->deferred_free(rp); - } -} - -static int rcu_no_completed(void) -{ - return 0; -} - -static void rcu_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu(&p->rtort_rcu, rcu_torture_cb); -} - -static void rcu_sync_torture_init(void) -{ - INIT_LIST_HEAD(&rcu_torture_removed); -} - -static struct rcu_torture_ops rcu_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu" -}; - -/* - * Definitions for rcu_bh torture testing. - */ - -static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) -{ - rcu_read_lock_bh(); - return 0; -} - -static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) -{ - rcu_read_unlock_bh(); -} - -static int rcu_bh_torture_completed(void) -{ - return rcu_batches_completed_bh(); -} - -static void rcu_bh_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops rcu_bh_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_bh_torture_deferred_free, - .sync = synchronize_rcu_bh, - .exp_sync = synchronize_rcu_bh_expedited, - .call = call_rcu_bh, - .cb_barrier = rcu_barrier_bh, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh" -}; - -/* - * Definitions for srcu torture testing. - */ - -DEFINE_STATIC_SRCU(srcu_ctl); - -static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) -{ - return srcu_read_lock(&srcu_ctl); -} - -static void srcu_read_delay(struct rcu_random_state *rrsp) -{ - long delay; - const long uspertick = 1000000 / HZ; - const long longdelay = 10; - - /* We want there to be long-running readers, but not all the time. 
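rcu_torture_cb() above ages each element through the pipeline: every invocation bumps rtort_pipe_count, records the stage in a histogram, and recycles the element once it has outlived the pipe. The core of that aging logic, reduced to a runnable sketch with illustrative names:

#include <stdio.h>

#define PIPE_LEN 10

static int wcount[PIPE_LEN + 1];        /* like rcu_torture_wcount[] */

struct elem {
        int pipe_count;
};

/*
 * Invoked once per emulated grace period for an in-flight element;
 * returns 1 when the element has aged out and should be freed.
 */
static int pipeline_age(struct elem *e)
{
        int i = e->pipe_count;

        if (i > PIPE_LEN)
                i = PIPE_LEN;   /* clamp, as the kernel code does */
        wcount[i]++;
        return ++e->pipe_count >= PIPE_LEN;
}

int main(void)
{
        struct elem e = { 0 };
        int gps = 0;

        while (!pipeline_age(&e))
                gps++;
        printf("freed after %d grace periods\n", gps + 1);
        return 0;
}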
*/ - - delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); - if (!delay) - schedule_timeout_interruptible(longdelay); - else - rcu_read_delay(rrsp); -} - -static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) -{ - srcu_read_unlock(&srcu_ctl, idx); -} - -static int srcu_torture_completed(void) -{ - return srcu_batches_completed(&srcu_ctl); -} - -static void srcu_torture_deferred_free(struct rcu_torture *rp) -{ - call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); -} - -static void srcu_torture_synchronize(void) -{ - synchronize_srcu(&srcu_ctl); -} - -static void srcu_torture_call(struct rcu_head *head, - void (*func)(struct rcu_head *head)) -{ - call_srcu(&srcu_ctl, head, func); -} - -static void srcu_torture_barrier(void) -{ - srcu_barrier(&srcu_ctl); -} - -static void srcu_torture_stats(char *page) -{ - int cpu; - int idx = srcu_ctl.completed & 0x1; - - page += sprintf(page, "%s%s per-CPU(idx=%d):", - torture_type, TORTURE_FLAG, idx); - for_each_possible_cpu(cpu) { - page += sprintf(page, " %d(%lu,%lu)", cpu, - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], - per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); - } - sprintf(page, "\n"); -} - -static void srcu_torture_synchronize_expedited(void) -{ - synchronize_srcu_expedited(&srcu_ctl); -} - -static struct rcu_torture_ops srcu_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = srcu_torture_deferred_free, - .sync = srcu_torture_synchronize, - .exp_sync = srcu_torture_synchronize_expedited, - .call = srcu_torture_call, - .cb_barrier = srcu_torture_barrier, - .stats = srcu_torture_stats, - .name = "srcu" -}; - -/* - * Definitions for sched torture testing. - */ - -static int sched_torture_read_lock(void) -{ - preempt_disable(); - return 0; -} - -static void sched_torture_read_unlock(int idx) -{ - preempt_enable(); -} - -static void rcu_sched_torture_deferred_free(struct rcu_torture *p) -{ - call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); -} - -static struct rcu_torture_ops sched_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sched_torture_deferred_free, - .sync = synchronize_sched, - .exp_sync = synchronize_sched_expedited, - .call = call_rcu_sched, - .cb_barrier = rcu_barrier_sched, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched" -}; - -/* - * RCU torture priority-boost testing. Runs one real-time thread per - * CPU for moderate bursts, repeatedly registering RCU callbacks and - * spinning waiting for them to be invoked. If a given callback takes - * too long to be invoked, we assume that priority inversion has occurred. 
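srcu_torture_read_lock() above returns an index that the matching unlock must hand back, because SRCU tracks readers per flip epoch. The sketch below shows only that index-passing shape with a two-counter epoch flip; real SRCU uses per-CPU counters and much stronger memory ordering, all of which this deliberately omits:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int completed;            /* low bit selects the active epoch */
static atomic_long readers[2];

/* Returns the epoch index the reader joined, like srcu_read_lock(). */
static int epoch_read_lock(void)
{
        int idx = atomic_load(&completed) & 0x1;

        atomic_fetch_add(&readers[idx], 1);
        return idx;
}

static void epoch_read_unlock(int idx)
{
        atomic_fetch_sub(&readers[idx], 1);
}

/* Writer: flip the epoch, then wait for the old epoch to drain. */
static void epoch_synchronize(void)
{
        int old = atomic_fetch_add(&completed, 1) & 0x1;

        while (atomic_load(&readers[old]) != 0)
                ;       /* spin; real SRCU sleeps and handles flip races */
}

int main(void)
{
        int idx = epoch_read_lock();

        epoch_read_unlock(idx);
        epoch_synchronize();
        printf("epoch drained\n");
        return 0;
}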
- */ - -struct rcu_boost_inflight { - struct rcu_head rcu; - int inflight; -}; - -static void rcu_torture_boost_cb(struct rcu_head *head) -{ - struct rcu_boost_inflight *rbip = - container_of(head, struct rcu_boost_inflight, rcu); - - smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ - rbip->inflight = 0; -} - -static int rcu_torture_boost(void *arg) -{ - unsigned long call_rcu_time; - unsigned long endtime; - unsigned long oldstarttime; - struct rcu_boost_inflight rbi = { .inflight = 0 }; - struct sched_param sp; - - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); - - /* Set real-time priority. */ - sp.sched_priority = 1; - if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); - n_rcu_torture_boost_rterror++; - } - - init_rcu_head_on_stack(&rbi.rcu); - /* Each pass through the following loop does one boost-test cycle. */ - do { - /* Wait for the next test interval. */ - oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { - schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* Do one boost-test interval. */ - endtime = oldstarttime + test_boost_duration * HZ; - call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { - /* If we don't have a callback in flight, post one. */ - if (!rbi.inflight) { - smp_mb(); /* RCU core before ->inflight = 1. */ - rbi.inflight = 1; - call_rcu(&rbi.rcu, rcu_torture_boost_cb); - if (jiffies - call_rcu_time > - test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); - n_rcu_torture_boost_failure++; - } - call_rcu_time = jiffies; - } - cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) - goto checkwait; - } - - /* - * Set the start time of the next test interval. - * Yes, this is vulnerable to long delays, but such - * delays simply cause a false negative for the next - * interval. Besides, we are running at RT priority, - * so delays should be relatively rare. - */ - while (oldstarttime == boost_starttime && - !kthread_should_stop()) { - if (mutex_trylock(&boost_mutex)) { - boost_starttime = jiffies + - test_boost_interval * HZ; - n_rcu_torture_boosts++; - mutex_unlock(&boost_mutex); - break; - } - schedule_timeout_uninterruptible(1); - } - - /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - /* Clean up and exit. */ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) - schedule_timeout_uninterruptible(1); - smp_mb(); /* order accesses to ->inflight before stack-frame death. */ - destroy_rcu_head_on_stack(&rbi.rcu); - return 0; -} - -/* - * RCU torture force-quiescent-state kthread. Repeatedly induces - * bursts of calls to force_quiescent_state(), increasing the probability - * of occurrence of some important types of race conditions. 
- */ -static int -rcu_torture_fqs(void *arg) -{ - unsigned long fqs_resume_time; - int fqs_burst_remaining; - - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); - do { - fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && - !kthread_should_stop()) { - schedule_timeout_interruptible(1); - } - fqs_burst_remaining = fqs_duration; - while (fqs_burst_remaining > 0 && - !kthread_should_stop()) { - cur_ops->fqs(); - udelay(fqs_holdoff); - fqs_burst_remaining -= fqs_holdoff; - } - rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture writer kthread. Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). - */ -static int -rcu_torture_writer(void *arg) -{ - bool exp; - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - struct rcu_torture *old_rp; - static DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1); - rp = rcu_torture_alloc(); - if (rp == NULL) - continue; - rp->rtort_pipe_count = 0; - udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_dereference_check(rcu_torture_current, - current == writer_task); - rp->rtort_mbtest = 1; - rcu_assign_pointer(rcu_torture_current, rp); - smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ - if (old_rp) { - i = old_rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - old_rp->rtort_pipe_count++; - if (gp_normal == gp_exp) - exp = !!(rcu_random(&rand) & 0x80); - else - exp = gp_exp; - if (!exp) { - cur_ops->deferred_free(old_rp); - } else { - cur_ops->exp_sync(); - list_add(&old_rp->rtort_free, - &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, - &rcu_torture_removed, - rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= - RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } - } - } - rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * RCU torture fake writer kthread. Repeatedly calls sync, with a random - * delay between calls. 
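rcu_torture_writer() above initializes the replacement element completely and only then publishes it with rcu_assign_pointer(), so readers using rcu_dereference() never see a half-built structure. A C11 analogue with hypothetical names, using a release store paired with an acquire load (rcu_dereference() actually needs only dependency ordering; acquire is a conservative stand-in):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct blob {
        int generation;
};

static _Atomic(struct blob *) current_blob;

/* Writer side: fully initialize, then publish with release semantics. */
static void publish(int gen)
{
        struct blob *nb = malloc(sizeof(*nb));
        struct blob *old;

        nb->generation = gen;
        old = atomic_exchange_explicit(&current_blob, nb,
                                       memory_order_release);
        /*
         * The kernel defers freeing `old` through a grace period;
         * this sketch simply retires it to the caller.
         */
        (void)old;
}

/* Reader side: acquire load pairs with the release store. */
static int read_generation(void)
{
        struct blob *b = atomic_load_explicit(&current_blob,
                                              memory_order_acquire);
        return b ? b->generation : -1;
}

int main(void)
{
        publish(1);
        printf("gen=%d\n", read_generation());
        return 0;
}

The deferred free is the whole point of the kernel version: the old element stays in the "pipeline" for several grace periods precisely so that readers can catch a too-early reuse.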
- */ -static int -rcu_torture_fakewriter(void *arg) -{ - DEFINE_RCU_RANDOM(rand); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); - set_user_nice(current, 19); - - do { - schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); - udelay(rcu_random(&rand) & 0x3ff); - if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) { - cur_ops->cb_barrier(); - } else if (gp_normal == gp_exp) { - if (rcu_random(&rand) & 0x80) - cur_ops->sync(); - else - cur_ops->exp_sync(); - } else if (gp_normal) { - cur_ops->sync(); - } else { - cur_ops->exp_sync(); - } - rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -void rcutorture_trace_dump(void) -{ - static atomic_t beenhere = ATOMIC_INIT(0); - - if (atomic_read(&beenhere)) - return; - if (atomic_xchg(&beenhere, 1) != 0) - return; - ftrace_dump(DUMP_ALL); -} - -/* - * RCU torture reader from timer handler. Dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. - */ -static void rcu_torture_timer(unsigned long unused) -{ - int idx; - int completed; - int completed_end; - static DEFINE_RCU_RANDOM(rand); - static DEFINE_SPINLOCK(rand_lock); - struct rcu_torture *p; - int pipe_count; - unsigned long long ts; - - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - if (p == NULL) { - /* Leave because rcu_torture_writer is not yet underway */ - cur_ops->readunlock(idx); - return; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - spin_lock(&rand_lock); - cur_ops->read_delay(&rand); - n_rcu_torture_timers++; - spin_unlock(&rand_lock); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed_end = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, - completed, completed_end); - rcutorture_trace_dump(); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); -} - -/* - * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, - * incrementing the corresponding element of the pipeline array. The - * counter in the element should never be greater than 1, otherwise, the - * RCU implementation is broken. 
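rcutorture_trace_dump() above uses a double-checked atomic_xchg() so that an expensive one-time diagnostic runs at most once even when many threads hit the error path together: a cheap read filters the common case, and the exchange decides the race. The same guard in portable C11:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int beenhere;

/* Expensive diagnostic that must run at most once, process-wide. */
static void dump_once(void)
{
        if (atomic_load(&beenhere))
                return;                 /* cheap early exit */
        if (atomic_exchange(&beenhere, 1) != 0)
                return;                 /* somebody else won the race */
        fprintf(stderr, "dumping diagnostics (once)\n");
}

int main(void)
{
        dump_once();
        dump_once();    /* second call is a no-op */
        return 0;
}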
- */ -static int -rcu_torture_reader(void *arg) -{ - int completed; - int completed_end; - int idx; - DEFINE_RCU_RANDOM(rand); - struct rcu_torture *p; - int pipe_count; - struct timer_list t; - unsigned long long ts; - - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); - set_user_nice(current, 19); - if (irqreader && cur_ops->irq_capable) - setup_timer_on_stack(&t, rcu_torture_timer, 0); - - do { - if (irqreader && cur_ops->irq_capable) { - if (!timer_pending(&t)) - mod_timer(&t, jiffies + 1); - } - idx = cur_ops->readlock(); - completed = cur_ops->completed(); - ts = rcu_trace_clock_local(); - p = rcu_dereference_check(rcu_torture_current, - rcu_read_lock_bh_held() || - rcu_read_lock_sched_held() || - srcu_read_lock_held(&srcu_ctl)); - if (p == NULL) { - /* Wait for rcu_torture_writer to get underway */ - cur_ops->readunlock(idx); - schedule_timeout_interruptible(HZ); - continue; - } - if (p->rtort_mbtest == 0) - atomic_inc(&n_rcu_torture_mberror); - cur_ops->read_delay(&rand); - preempt_disable(); - pipe_count = p->rtort_pipe_count; - if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - pipe_count = RCU_TORTURE_PIPE_LEN; - } - completed_end = cur_ops->completed(); - if (pipe_count > 1) { - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, - ts, completed, completed_end); - rcutorture_trace_dump(); - } - __this_cpu_inc(rcu_torture_count[pipe_count]); - completed = completed_end - completed; - if (completed > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ - completed = RCU_TORTURE_PIPE_LEN; - } - __this_cpu_inc(rcu_torture_batch[completed]); - preempt_enable(); - cur_ops->readunlock(idx); - schedule(); - rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); - if (irqreader && cur_ops->irq_capable) - del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); - return 0; -} - -/* - * Create an RCU-torture statistics message in the specified buffer. 
- */ -static void -rcu_torture_printk(char *page) -{ - int cpu; - int i; - long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; - - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; - batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; - } - } - for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { - if (pipesummary[i] != 0) - break; - } - page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, - "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", - rcu_torture_current, - rcu_torture_current_version, - list_empty(&rcu_torture_freelist), - atomic_read(&n_rcu_torture_alloc), - atomic_read(&n_rcu_torture_alloc_fail), - atomic_read(&n_rcu_torture_free)); - page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", - atomic_read(&n_rcu_torture_mberror), - n_rcu_torture_boost_ktrerror, - n_rcu_torture_boost_rterror); - page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", - n_rcu_torture_boost_failure, - n_rcu_torture_boosts, - n_rcu_torture_timers); - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); - page += sprintf(page, "barrier: %ld/%ld:%ld", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - if (atomic_read(&n_rcu_torture_mberror) != 0 || - n_rcu_torture_barrier_error != 0 || - n_rcu_torture_boost_ktrerror != 0 || - n_rcu_torture_boost_rterror != 0 || - n_rcu_torture_boost_failure != 0 || - i > 1) { - page += sprintf(page, "!!! "); - atomic_inc(&n_rcu_torture_error); - WARN_ON_ONCE(1); - } - page += sprintf(page, "Reader Pipe: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", pipesummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Reader Batch: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - page += sprintf(page, " %ld", batchsummary[i]); - page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); - page += sprintf(page, "Free-Block Circulation: "); - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - page += sprintf(page, " %d", - atomic_read(&rcu_torture_wcount[i])); - } - page += sprintf(page, "\n"); - if (cur_ops->stats) - cur_ops->stats(page); -} - -/* - * Print torture statistics. Caller must ensure that there is only - * one call to this function at a given time!!! This is normally - * accomplished by relying on the module system to only have one copy - * of the module loaded, and then by giving the rcu_torture_stats - * kthread full control (or the init/cleanup functions when rcu_torture_stats - * thread is not running). - */ -static void -rcu_torture_stats_print(void) -{ - int size = nr_cpu_ids * 200 + 8192; - char *buf; - - buf = kmalloc(size, GFP_KERNEL); - if (!buf) { - pr_err("rcu-torture: Out of memory, need: %d", size); - return; - } - rcu_torture_printk(buf); - pr_alert("%s", buf); - kfree(buf); -} - -/* - * Periodically prints torture statistics, if periodic statistics printing - * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. 
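rcu_torture_printk() above builds its report by repeatedly advancing the page pointer past each sprintf(), trusting the caller's generously sized buffer. A bounded userspace variant of that accumulate-into-a-buffer idiom, using snprintf() and a hypothetical buf_appendf() helper (simplified to a single format argument rather than full varargs):

#include <stdio.h>

/*
 * Append formatted text to buf, tracking the write position and never
 * running past len; returns the new position.
 */
static size_t buf_appendf(char *buf, size_t pos, size_t len,
                          const char *fmt, long val)
{
        int n;

        if (pos >= len)
                return pos;     /* buffer exhausted; drop further output */
        n = snprintf(buf + pos, len - pos, fmt, val);
        return n < 0 ? pos : pos + (size_t)n;
}

int main(void)
{
        char page[128];
        size_t pos = 0;
        long i;

        for (i = 0; i < 3; i++)
                pos = buf_appendf(page, pos, sizeof(page), " %ld", i);
        printf("%s\n", page);
        return 0;
}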
- */ -static int -rcu_torture_stats(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); - do { - schedule_timeout_interruptible(stat_interval * HZ); - rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); - return 0; -} - -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if (barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. 
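The shuffler above clears one CPU out of every task's affinity mask so that the chosen CPU can go fully idle. The Linux-specific core of that move in userspace, assuming pthread_setaffinity_np() and an illustrative avoid_cpu() helper:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Pin the calling thread everywhere except `idle_cpu`, leaving that CPU
 * free to go idle -- the core move of rcu_torture_shuffle_tasks().
 */
static int avoid_cpu(int idle_cpu)
{
        cpu_set_t mask;
        long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
        long cpu;

        if (ncpus < 2)
                return 0;       /* nothing to shuffle on UP */
        CPU_ZERO(&mask);
        for (cpu = 0; cpu < ncpus; cpu++)
                if (cpu != idle_cpu)
                        CPU_SET(cpu, &mask);
        return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
}

int main(void)
{
        if (avoid_cpu(0) == 0)
                printf("now excluded from CPU 0\n");
        return 0;
}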
- */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); - return 0; -} - -static inline void -rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) -{ - pr_alert("%s" TORTURE_FLAG - "--- %s: nreaders=%d nfakewriters=%d " - "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d " - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " - "test_boost=%d/%d test_boost_interval=%d " - "test_boost_duration=%d shutdown_secs=%d " - "stall_cpu=%d stall_cpu_holdoff=%d " - "n_barrier_cbs=%d " - "onoff_interval=%d onoff_holdoff=%d\n", - torture_type, tag, nrealreaders, nfakewriters, - stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, - test_boost, cur_ops->can_boost, - test_boost_interval, test_boost_duration, shutdown_secs, - stall_cpu, stall_cpu_holdoff, - n_barrier_cbs, - onoff_interval, onoff_holdoff); -} - -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - -static void rcutorture_booster_cleanup(int cpu) -{ - struct task_struct *t; - - if (boost_tasks[cpu] == NULL) - return; - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); - t = boost_tasks[cpu]; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - - /* This must be outside of the mutex, otherwise deadlock! */ - kthread_stop(t); - boost_tasks[cpu] = NULL; -} - -static int rcutorture_booster_init(int cpu) -{ - int retval; - - if (boost_tasks[cpu] != NULL) - return 0; /* Already created, nothing more to do. */ - - /* Don't allow time recalculation while creating a new task. */ - mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, - cpu_to_node(cpu), - "rcu_torture_boost"); - if (IS_ERR(boost_tasks[cpu])) { - retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); - n_rcu_torture_boost_ktrerror++; - boost_tasks[cpu] = NULL; - mutex_unlock(&boost_mutex); - return retval; - } - kthread_bind(boost_tasks[cpu], cpu); - wake_up_process(boost_tasks[cpu]); - mutex_unlock(&boost_mutex); - return 0; -} - -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. 
*/ - - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. - */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_RCU_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - -/* - * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then - * induces a CPU stall for the time specified by stall_cpu. 
- */ -static int rcu_torture_stall(void *args) -{ - unsigned long stop_at; - - VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); - if (stall_cpu_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); - schedule_timeout_interruptible(stall_cpu_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); - } - if (!kthread_should_stop()) { - stop_at = get_seconds() + stall_cpu; - /* RCU CPU stall is expected behavior in following code. */ - pr_alert("rcu_torture_stall start.\n"); - rcu_read_lock(); - preempt_disable(); - while (ULONG_CMP_LT(get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ - preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); - } - rcutorture_shutdown_absorb("rcu_torture_stall"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(10 * HZ); - return 0; -} - -/* Spawn CPU-stall kthread, if stall_cpu specified. */ -static int __init rcu_torture_stall_init(void) -{ - int ret; - - if (stall_cpu <= 0) - return 0; - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - return 0; -} - -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; -} - -/* Callback function for RCU barrier testing. */ -void rcu_torture_barrier_cbf(struct rcu_head *rcu) -{ - atomic_inc(&barrier_cbs_invoked); -} - -/* kthread function to register callbacks used to test RCU barriers. */ -static int rcu_torture_barrier_cbs(void *arg) -{ - long myid = (long)arg; - bool lastphase = 0; - bool newphase; - struct rcu_head rcu; - - init_rcu_head_on_stack(&rcu); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); - set_user_nice(current, 19); - do { - wait_event(barrier_cbs_wq[myid], - (newphase = - ACCESS_ONCE(barrier_phase)) != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - lastphase = newphase; - smp_mb(); /* ensure barrier_phase load before ->call(). */ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) - break; - cur_ops->call(&rcu, rcu_torture_barrier_cbf); - if (atomic_dec_and_test(&barrier_cbs_count)) - wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); - cur_ops->cb_barrier(); - destroy_rcu_head_on_stack(&rcu); - return 0; -} - -/* kthread function to drive and coordinate RCU barrier testing. */ -static int rcu_torture_barrier(void *arg) -{ - int i; - - VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); - do { - atomic_set(&barrier_cbs_invoked, 0); - atomic_set(&barrier_cbs_count, n_barrier_cbs); - smp_mb(); /* Ensure barrier_phase after prior assignments. */ - barrier_phase = !barrier_phase; - for (i = 0; i < n_barrier_cbs; i++) - wake_up(&barrier_cbs_wq[i]); - wait_event(barrier_wq, - atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) - break; - n_barrier_attempts++; - cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
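rcu_torture_barrier() above drives rounds by flipping barrier_phase, waking every callback-posting kthread, and waiting for barrier_cbs_count to drain to zero before checking the invoked count. A pthread condition-variable rendition of that phase-flip handshake (the kernel version is lockless and relies on smp_mb(); this sketch trades that for a mutex, and all names are illustrative):

#include <pthread.h>
#include <stdio.h>

#define NCBS 4

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t phase_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static int phase;       /* like barrier_phase */
static int pending;     /* like barrier_cbs_count */
static int stop;

static void *cb_thread(void *arg)
{
        int seen = 0;

        pthread_mutex_lock(&lock);
        for (;;) {
                while (phase == seen && !stop)
                        pthread_cond_wait(&phase_cv, &lock); /* wait_event() */
                if (stop)
                        break;
                seen = phase;
                /* "post" one callback for this round */
                if (--pending == 0)
                        pthread_cond_signal(&done_cv);  /* wake_up(&barrier_wq) */
        }
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t tid[NCBS];
        int i, round;

        for (i = 0; i < NCBS; i++)
                pthread_create(&tid[i], NULL, cb_thread, NULL);
        for (round = 0; round < 3; round++) {
                pthread_mutex_lock(&lock);
                pending = NCBS;
                phase++;                        /* flip the test phase */
                pthread_cond_broadcast(&phase_cv);
                while (pending != 0)
                        pthread_cond_wait(&done_cv, &lock);
                pthread_mutex_unlock(&lock);
                printf("round %d: all %d callbacks posted\n", round, NCBS);
        }
        pthread_mutex_lock(&lock);
        stop = 1;
        pthread_cond_broadcast(&phase_cv);
        pthread_mutex_unlock(&lock);
        for (i = 0; i < NCBS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}

In the kernel version a further cb_barrier() round then forces every posted callback to be invoked, and the invoked count is compared against n_barrier_cbs to detect a broken rcu_barrier().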
*/ - if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { - n_rcu_torture_barrier_error++; - WARN_ON_ONCE(1); - } - n_barrier_successes++; - schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); - return 0; -} - -/* Initialize RCU barrier testing. */ -static int rcu_torture_barrier_init(void) -{ - int i; - int ret; - - if (n_barrier_cbs == 0) - return 0; - if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { - pr_alert("%s" TORTURE_FLAG - " Call or barrier ops missing for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " RCU barrier testing omitted from run.\n", - torture_type); - return 0; - } - atomic_set(&barrier_cbs_count, 0); - atomic_set(&barrier_cbs_invoked, 0); - barrier_cbs_tasks = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), - GFP_KERNEL); - barrier_cbs_wq = - kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), - GFP_KERNEL); - if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) - return -ENOMEM; - for (i = 0; i < n_barrier_cbs; i++) { - init_waitqueue_head(&barrier_cbs_wq[i]); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; - return ret; - } - } - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; - } - return 0; -} - -/* Clean up after RCU barrier testing. */ -static void rcu_torture_barrier_cleanup(void) -{ - int i; - - if (barrier_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } - if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } - kfree(barrier_cbs_tasks); - barrier_cbs_tasks = NULL; - } - if (barrier_cbs_wq != NULL) { - kfree(barrier_cbs_wq); - barrier_cbs_wq = NULL; - } -} - -static int rcutorture_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - long cpu = (long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_DOWN_FAILED: - (void)rcutorture_booster_init(cpu); - break; - case CPU_DOWN_PREPARE: - rcutorture_booster_cleanup(cpu); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block rcutorture_cpu_nb = { - .notifier_call = rcutorture_cpu_notify, -}; - -static void -rcu_torture_cleanup(void) -{ - int i; - - mutex_lock(&fullstop_mutex); - rcutorture_record_test_transition(); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... 
*/ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - return; - } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&rcutorture_shutdown_nb); - rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; - if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; - - if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } - kfree(reader_tasks); - reader_tasks = NULL; - } - rcu_torture_current = NULL; - - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; - } - kfree(fakewriter_tasks); - fakewriter_tasks = NULL; - } - - if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - unregister_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) - rcutorture_booster_cleanup(i); - } - if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; - rcu_torture_onoff_cleanup(); - - /* Wait for all RCU callbacks to fire. */ - - if (cur_ops->cb_barrier != NULL) - cur_ops->cb_barrier(); - - rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ - - if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) - rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) - rcu_torture_print_module_parms(cur_ops, - "End of test: RCU_HOTPLUG"); - else - rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); -} - -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD -static void rcu_torture_leak_cb(struct rcu_head *rhp) -{ -} - -static void rcu_torture_err_cb(struct rcu_head *rhp) -{ - /* - * This -might- happen due to race conditions, but is unlikely. - * The scenario that leads to this happening is that the - * first of the pair of duplicate callbacks is queued, - * someone else starts a grace period that includes that - * callback, then the second of the pair must wait for the - * next grace period. Unlikely, but can happen. If it - * does happen, the debug-objects subsystem won't have splatted. - */ - pr_alert("rcutorture: duplicated callback was invoked.\n"); -} -#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - -/* - * Verify that double-free causes debug-objects to complain, but only - * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test - * cannot be carried out. 
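 * (Editor's note: the test below queues the same rcu_head twice inside
 * one RCU read-side critical section, with interrupts disabled so that
 * both call_rcu() invocations are associated with the same grace
 * period, giving debug-objects its best chance to spot the duplicate.)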
- */ -static void rcu_test_debug_objects(void) -{ -#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD - struct rcu_head rh1; - struct rcu_head rh2; - - init_rcu_head_on_stack(&rh1); - init_rcu_head_on_stack(&rh2); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); - - /* Try to queue the rh2 pair of callbacks for the same grace period. */ - preempt_disable(); /* Prevent preemption from interrupting test. */ - rcu_read_lock(); /* Make it impossible to finish a grace period. */ - call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ - local_irq_disable(); /* Make it harder to start a new grace period. */ - call_rcu(&rh2, rcu_torture_leak_cb); - call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ - local_irq_enable(); - rcu_read_unlock(); - preempt_enable(); - - /* Wait for them all to get done so we can safely return. */ - rcu_barrier(); - pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); - destroy_rcu_head_on_stack(&rh1); - destroy_rcu_head_on_stack(&rh2); -#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ - pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); -#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -} - -static int __init -rcu_torture_init(void) -{ - int i; - int cpu; - int firsterr = 0; - int retval; - static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, - }; - - mutex_lock(&fullstop_mutex); - - /* Process args and tell the world that the torturer is on the job. */ - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { - cur_ops = torture_ops[i]; - if (strcmp(torture_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(torture_ops)) { - pr_alert("rcu-torture: invalid torture type: \"%s\"\n", - torture_type); - pr_alert("rcu-torture types:"); - for (i = 0; i < ARRAY_SIZE(torture_ops); i++) - pr_alert(" %s", torture_ops[i]->name); - pr_alert("\n"); - mutex_unlock(&fullstop_mutex); - return -EINVAL; - } - if (cur_ops->fqs == NULL && fqs_duration != 0) { - pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); - fqs_duration = 0; - } - if (cur_ops->init) - cur_ops->init(); /* no "goto unwind" prior to this point!!! */ - - if (nreaders >= 0) - nrealreaders = nreaders; - else - nrealreaders = 2 * num_online_cpus(); - rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; - - /* Set up the freelist. */ - - INIT_LIST_HEAD(&rcu_torture_freelist); - for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { - rcu_tortures[i].rtort_mbtest = 0; - list_add_tail(&rcu_tortures[i].rtort_free, - &rcu_torture_freelist); - } - - /* Initialize the statistics so that each run gets its own numbers. */ - - rcu_torture_current = NULL; - rcu_torture_current_version = 0; - atomic_set(&n_rcu_torture_alloc, 0); - atomic_set(&n_rcu_torture_alloc_fail, 0); - atomic_set(&n_rcu_torture_free, 0); - atomic_set(&n_rcu_torture_mberror, 0); - atomic_set(&n_rcu_torture_error, 0); - n_rcu_torture_barrier_error = 0; - n_rcu_torture_boost_ktrerror = 0; - n_rcu_torture_boost_rterror = 0; - n_rcu_torture_boost_failure = 0; - n_rcu_torture_boosts = 0; - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) - atomic_set(&rcu_torture_wcount[i], 0); - for_each_possible_cpu(cpu) { - for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { - per_cpu(rcu_torture_count, cpu)[i] = 0; - per_cpu(rcu_torture_batch, cpu)[i] = 0; - } - } - - /* Start up the kthreads. 
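 * (Editor's note: creation order matters below -- writer first, then
 * fakewriters, readers, and the support tasks; any failure records the
 * first error in firsterr and branches to the common unwind path, which
 * calls rcu_torture_cleanup() to stop whatever was already started.)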
*/ - - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); - writer_task = NULL; - goto unwind; - } - wake_up_process(writer_task); - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; - goto unwind; - } - } - reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; - goto unwind; - } - } - if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); - stats_task = NULL; - goto unwind; - } - } - if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); - goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } - } - if (stutter < 0) - stutter = 0; - if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - goto unwind; - } - } - if (fqs_duration < 0) - fqs_duration = 0; - if (fqs_duration) { - /* Create the stutter thread */ - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; - goto unwind; - } - } - if (test_boost_interval < 1) - test_boost_interval = 1; - if (test_boost_duration < 2) - test_boost_duration = 2; - if ((test_boost == 1 && cur_ops->can_boost) || - test_boost == 2) { - - boost_starttime = jiffies + test_boost_interval * HZ; - register_cpu_notifier(&rcutorture_cpu_nb); - for_each_possible_cpu(i) { - if (cpu_is_offline(i)) - continue; /* Heuristic: CPU can go offline. 
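 * (Editor's note: cpu_is_offline() is only a heuristic here because the
 * CPU can change state before rcutorture_booster_init() runs; CPUs that
 * come online later are handled by the CPU-hotplug notifier registered
 * just above.)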
*/ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; - goto unwind; - } - } - } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - wake_up_process(shutdown_task); - } - i = rcu_torture_onoff_init(); - if (i != 0) { - firsterr = i; - goto unwind; - } - register_reboot_notifier(&rcutorture_shutdown_nb); - i = rcu_torture_stall_init(); - if (i != 0) { - firsterr = i; - goto unwind; - } - retval = rcu_torture_barrier_init(); - if (retval != 0) { - firsterr = retval; - goto unwind; - } - if (object_debug) - rcu_test_debug_objects(); - rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); - return 0; - -unwind: - mutex_unlock(&fullstop_mutex); - rcu_torture_cleanup(); - return firsterr; -} - -module_init(rcu_torture_init); -module_exit(rcu_torture_cleanup); diff --git a/kernel/torture.c b/kernel/torture.c new file mode 100644 index 000000000000..c82c70f7828e --- /dev/null +++ b/kernel/torture.c @@ -0,0 +1,71 @@ +/* + * Common functions for in-kernel torture tests. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney + * Based on kernel/rcu/torture.c. + */ +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/err.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <linux/bitops.h> +#include <linux/completion.h> +#include <linux/moduleparam.h> +#include <linux/percpu.h> +#include <linux/notifier.h> +#include <linux/reboot.h> +#include <linux/freezer.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/stat.h> +#include <linux/slab.h> +#include <linux/trace_clock.h> +#include <asm/byteorder.h> +#include <linux/torture.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +#define TORTURE_RANDOM_MULT 39916801 /* prime */ +#define TORTURE_RANDOM_ADD 479001701 /* prime */ +#define TORTURE_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from cpu_clock(). + */ +unsigned long +torture_random(struct torture_random_state *trsp) +{ + if (--trsp->trs_count < 0) { + trsp->trs_state += (unsigned long)local_clock(); + trsp->trs_count = TORTURE_RANDOM_REFRESH; + } + trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT + + TORTURE_RANDOM_ADD; + return swahw32(trsp->trs_state); +} +EXPORT_SYMBOL_GPL(torture_random); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a48abeac753f..2bfb4e5cdf8c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1141,9 +1141,14 @@ config SPARSE_RCU_POINTER Say N if you are unsure. 
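(Editor's aside on torture_random() above -- a minimal usage sketch; the
DEFINE_TORTURE_RANDOM() helper is the one declared in include/linux/torture.h,
and the modulus of 8 is arbitrary, chosen only for illustration:

	static DEFINE_TORTURE_RANDOM(rand);

	/* Pick one of eight roughly equiprobable alternatives. */
	int choice = torture_random(&rand) % 8;

Each kthread keeps its own state, reseeded from local_clock() every
TORTURE_RANDOM_REFRESH calls, so the generator is fast but deliberately
not cryptographic.)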
+config TORTURE_TEST + tristate + default n + config RCU_TORTURE_TEST tristate "torture tests for RCU" depends on DEBUG_KERNEL + select TORTURE_TEST default n help This option provides a kernel module that runs torture tests -- cgit v1.3-14-g43fede From 9e2502254132261e0ea8010692fd447b1cedf627 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 27 Jan 2014 16:27:00 -0800 Subject: rcutorture: Abstract torture_param() Create a torture_param() macro and apply it to rcutorture in order to save a few lines of code. This same macro may be applied to other torture frameworks. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 9 ++++- kernel/rcu/rcutorture.c | 103 ++++++++++++++++-------------------------------- 2 files changed, 41 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 979e3e6b378a..5aae880b6b4e 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -35,13 +35,18 @@ #include #include +/* Definitions for a non-string torture-test module parameter. */ +#define torture_param(type, name, init, msg) \ + static type name = init; \ + module_param(name, type, 0444); \ + MODULE_PARM_DESC(name, msg); + +/* Low-rider random number generator. */ struct torture_random_state { unsigned long trs_state; long trs_count; }; - #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } - unsigned long torture_random(struct torture_random_state *trsp); #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 94b1cd8b214c..930791e0698d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -59,78 +59,43 @@ MODULE_ALIAS("rcutorture"); #endif #define MODULE_PARAM_PREFIX "rcutorture." -static int fqs_duration; -module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); -static int fqs_holdoff; -module_param(fqs_holdoff, int, 0444); -MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); -static int fqs_stutter = 3; -module_param(fqs_stutter, int, 0444); -MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -static bool gp_exp; -module_param(gp_exp, bool, 0444); -MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); -static bool gp_normal; -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); -static int irqreader = 1; -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); -static int n_barrier_cbs; -module_param(n_barrier_cbs, int, 0444); -MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); -static int nfakewriters = 4; -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -static int nreaders = -1; -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -static int object_debug; -module_param(object_debug, int, 0444); -MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -static int onoff_holdoff; -module_param(onoff_holdoff, int, 0444); -MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); -static int onoff_interval; -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); -static int shuffle_interval = 3; -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, 
"Number of seconds between shuffles"); -static int shutdown_secs; -module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); -static int stall_cpu; -module_param(stall_cpu, int, 0444); -MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); -static int stall_cpu_holdoff = 10; -module_param(stall_cpu_holdoff, int, 0444); -MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); -static int stat_interval = 60; -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -static int stutter = 5; -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -static int test_boost = 1; -module_param(test_boost, int, 0444); -MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -static int test_boost_duration = 4; -module_param(test_boost_duration, int, 0444); -MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); -static int test_boost_interval = 7; -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); -static bool test_no_idle_hz = true; -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +torture_param(int, fqs_duration, 0, + "Duration of fqs bursts (us), 0 to disable"); +torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); +torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(bool, gp_normal, false, + "Use normal (non-expedited) GP wait primitives"); +torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, n_barrier_cbs, 0, + "# of callbacks/kthreads for barrier testing"); +torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, object_debug, 0, + "Enable debug-object double call_rcu() testing"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles"); +torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable."); +torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); +torture_param(int, stall_cpu_holdoff, 10, + "Time to wait before starting stall (s)."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of seconds to run/halt test"); +torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +torture_param(int, test_boost_duration, 4, + "Duration of each boost test, seconds."); +torture_param(int, test_boost_interval, 7, + "Interval between boost tests, seconds."); +torture_param(bool, test_no_idle_hz, true, + "Test support for tickless idle CPUs"); +torture_param(bool, verbose, false, "Enable verbose debugging printk()s"); + static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -static bool verbose; -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); #define TORTURE_FLAG 
"-torture:" #define PRINTK_STRING(s) \ -- cgit v1.3-14-g43fede From 5ccf60f23d33afd53568cff4f3f421f2ca624401 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jan 2014 07:25:25 -0800 Subject: rcutorture: Rename PRINTK to TOROUT Since it doesn't do printk()s anymore anyway, this commit renames these macros from PRINTK to TOROUT (short for torture output). Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 134 ++++++++++++++++++++++++------------------------ 1 file changed, 67 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 930791e0698d..34a75b1170e8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -98,11 +98,11 @@ module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); #define TORTURE_FLAG "-torture:" -#define PRINTK_STRING(s) \ +#define TOROUT_STRING(s) \ do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_STRING(s) \ +#define VERBOSE_TOROUT_STRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_PRINTK_ERRSTRING(s) \ +#define VERBOSE_TOROUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) static int nrealreaders; @@ -619,12 +619,12 @@ static int rcu_torture_boost(void *arg) struct rcu_boost_inflight rbi = { .inflight = 0 }; struct sched_param sp; - VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + VERBOSE_TOROUT_STRING("rcu_torture_boost started"); /* Set real-time priority. */ sp.sched_priority = 1; if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { - VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!"); n_rcu_torture_boost_rterror++; } @@ -652,7 +652,7 @@ static int rcu_torture_boost(void *arg) call_rcu(&rbi.rcu, rcu_torture_boost_cb); if (jiffies - call_rcu_time > test_boost_duration * HZ - HZ / 2) { - VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed"); n_rcu_torture_boost_failure++; } call_rcu_time = jiffies; @@ -688,7 +688,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); /* Clean up and exit. 
*/ - VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); rcutorture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); @@ -708,7 +708,7 @@ rcu_torture_fqs(void *arg) unsigned long fqs_resume_time; int fqs_burst_remaining; - VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; while (ULONG_CMP_LT(jiffies, fqs_resume_time) && @@ -724,7 +724,7 @@ rcu_torture_fqs(void *arg) } rcu_stutter_wait("rcu_torture_fqs"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); rcutorture_shutdown_absorb("rcu_torture_fqs"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -746,7 +746,7 @@ rcu_torture_writer(void *arg) struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); - VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); set_user_nice(current, 19); do { @@ -796,7 +796,7 @@ rcu_torture_writer(void *arg) rcutorture_record_progress(++rcu_torture_current_version); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); rcutorture_shutdown_absorb("rcu_torture_writer"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -812,7 +812,7 @@ rcu_torture_fakewriter(void *arg) { DEFINE_TORTURE_RANDOM(rand); - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); set_user_nice(current, 19); do { @@ -834,7 +834,7 @@ rcu_torture_fakewriter(void *arg) rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); rcutorture_shutdown_absorb("rcu_torture_fakewriter"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); @@ -928,7 +928,7 @@ rcu_torture_reader(void *arg) struct timer_list t; unsigned long long ts; - VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + VERBOSE_TOROUT_STRING("rcu_torture_reader task started"); set_user_nice(current, 19); if (irqreader && cur_ops->irq_capable) setup_timer_on_stack(&t, rcu_torture_timer, 0); @@ -978,7 +978,7 @@ rcu_torture_reader(void *arg) schedule(); rcu_stutter_wait("rcu_torture_reader"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); rcutorture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); @@ -1099,13 +1099,13 @@ rcu_torture_stats_print(void) static int rcu_torture_stats(void *arg) { - VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stats task started"); do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); rcutorture_shutdown_absorb("rcu_torture_stats"); } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + 
VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); return 0; } @@ -1183,13 +1183,13 @@ static void rcu_torture_shuffle_tasks(void) static int rcu_torture_shuffle(void *arg) { - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + VERBOSE_TOROUT_STRING("rcu_torture_shuffle task started"); do { schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); rcutorture_shutdown_absorb("rcu_torture_shuffle"); } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_shuffle task stopping"); return 0; } @@ -1199,7 +1199,7 @@ rcu_torture_shuffle(void *arg) static int rcu_torture_stutter(void *arg) { - VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stutter task started"); do { schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 1; @@ -1208,7 +1208,7 @@ rcu_torture_stutter(void *arg) stutter_pause_test = 0; rcutorture_shutdown_absorb("rcu_torture_stutter"); } while (!kthread_should_stop()); - VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_stutter task stopping"); return 0; } @@ -1246,7 +1246,7 @@ static void rcutorture_booster_cleanup(int cpu) if (boost_tasks[cpu] == NULL) return; mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_boost task"); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); @@ -1265,13 +1265,13 @@ static int rcutorture_booster_init(int cpu) /* Don't allow time recalculation while creating a new task. */ mutex_lock(&boost_mutex); - VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task"); boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, cpu_to_node(cpu), "rcu_torture_boost"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); - VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed"); n_rcu_torture_boost_ktrerror++; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); @@ -1293,7 +1293,7 @@ rcu_torture_shutdown(void *arg) long delta; unsigned long jiffies_snap; - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); + VERBOSE_TOROUT_STRING("rcu_torture_shutdown task started"); jiffies_snap = ACCESS_ONCE(jiffies); while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && !kthread_should_stop()) { @@ -1306,13 +1306,13 @@ rcu_torture_shutdown(void *arg) jiffies_snap = ACCESS_ONCE(jiffies); } if (kthread_should_stop()) { - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_shutdown task stopping"); return 0; } /* OK, shut down the system. */ - VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); + VERBOSE_TOROUT_STRING("rcu_torture_shutdown task shutting down system"); shutdown_task = NULL; /* Avoid self-kill deadlock. */ rcu_torture_cleanup(); /* Get the success/failure message. */ kernel_power_off(); /* Shut down the system. 
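 * (Editor's note: rcu_torture_cleanup() runs before kernel_power_off()
 * so the final success/failure verdict reaches the console, and
 * shutdown_task is NULLed first so that cleanup does not try to
 * kthread_stop() the very task performing the shutdown.)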
*/ @@ -1335,14 +1335,14 @@ rcu_torture_onoff(void *arg) int ret; unsigned long starttime; - VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); + VERBOSE_TOROUT_STRING("rcu_torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); if (onoff_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_onoff begin holdoff"); schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_onoff end holdoff"); } while (!kthread_should_stop()) { cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); @@ -1409,7 +1409,7 @@ rcu_torture_onoff(void *arg) } schedule_timeout_interruptible(onoff_interval * HZ); } - VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_onoff task stopping"); return 0; } @@ -1433,7 +1433,7 @@ static void rcu_torture_onoff_cleanup(void) { if (onoff_task == NULL) return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_onoff task"); kthread_stop(onoff_task); onoff_task = NULL; } @@ -1460,11 +1460,11 @@ static int rcu_torture_stall(void *args) { unsigned long stop_at; - VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); if (stall_cpu_holdoff > 0) { - VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff"); schedule_timeout_interruptible(stall_cpu_holdoff * HZ); - VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } if (!kthread_should_stop()) { stop_at = get_seconds() + stall_cpu; @@ -1505,7 +1505,7 @@ static void rcu_torture_stall_cleanup(void) { if (stall_task == NULL) return; - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_stall_task."); kthread_stop(stall_task); stall_task = NULL; } @@ -1525,7 +1525,7 @@ static int rcu_torture_barrier_cbs(void *arg) struct rcu_head rcu; init_rcu_head_on_stack(&rcu); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started"); set_user_nice(current, 19); do { wait_event(barrier_cbs_wq[myid], @@ -1541,7 +1541,7 @@ static int rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); @@ -1555,7 +1555,7 @@ static int rcu_torture_barrier(void *arg) { int i; - VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting"); do { atomic_set(&barrier_cbs_invoked, 0); atomic_set(&barrier_cbs_count, n_barrier_cbs); @@ -1578,7 +1578,7 @@ static int rcu_torture_barrier(void *arg) n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); - VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); + VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); rcutorture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); @@ -1619,7 +1619,7 @@ 
static int rcu_torture_barrier_init(void) "rcu_torture_barrier_cbs"); if (IS_ERR(barrier_cbs_tasks[i])) { ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); + VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); barrier_cbs_tasks[i] = NULL; return ret; } @@ -1628,7 +1628,7 @@ static int rcu_torture_barrier_init(void) "rcu_torture_barrier"); if (IS_ERR(barrier_task)) { ret = PTR_ERR(barrier_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); + VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); barrier_task = NULL; } return 0; @@ -1640,14 +1640,14 @@ static void rcu_torture_barrier_cleanup(void) int i; if (barrier_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier task"); kthread_stop(barrier_task); barrier_task = NULL; } if (barrier_cbs_tasks != NULL) { for (i = 0; i < n_barrier_cbs; i++) { if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier_cbs task"); kthread_stop(barrier_cbs_tasks[i]); barrier_cbs_tasks[i] = NULL; } @@ -1706,19 +1706,19 @@ rcu_torture_cleanup(void) rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); } stutter_task = NULL; if (shuffler_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_shuffle task"); kthread_stop(shuffler_task); free_cpumask_var(shuffle_tmp_mask); } shuffler_task = NULL; if (writer_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); } writer_task = NULL; @@ -1726,7 +1726,7 @@ rcu_torture_cleanup(void) if (reader_tasks) { for (i = 0; i < nrealreaders; i++) { if (reader_tasks[i]) { - VERBOSE_PRINTK_STRING( + VERBOSE_TOROUT_STRING( "Stopping rcu_torture_reader task"); kthread_stop(reader_tasks[i]); } @@ -1740,7 +1740,7 @@ rcu_torture_cleanup(void) if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { if (fakewriter_tasks[i]) { - VERBOSE_PRINTK_STRING( + VERBOSE_TOROUT_STRING( "Stopping rcu_torture_fakewriter task"); kthread_stop(fakewriter_tasks[i]); } @@ -1751,13 +1751,13 @@ rcu_torture_cleanup(void) } if (stats_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_stats task"); kthread_stop(stats_task); } stats_task = NULL; if (fqs_task) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_fqs task"); kthread_stop(fqs_task); } fqs_task = NULL; @@ -1768,7 +1768,7 @@ rcu_torture_cleanup(void) rcutorture_booster_cleanup(i); } if (shutdown_task != NULL) { - VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); + VERBOSE_TOROUT_STRING("Stopping rcu_torture_shutdown task"); kthread_stop(shutdown_task); } shutdown_task = NULL; @@ -1924,12 +1924,12 @@ rcu_torture_init(void) /* Start up the kthreads. 
*/ - VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_writer task"); writer_task = kthread_create(rcu_torture_writer, NULL, "rcu_torture_writer"); if (IS_ERR(writer_task)) { firsterr = PTR_ERR(writer_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + VERBOSE_TOROUT_ERRSTRING("Failed to create writer"); writer_task = NULL; goto unwind; } @@ -1937,17 +1937,17 @@ rcu_torture_init(void) fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); + VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nfakewriters; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_fakewriter task"); fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, "rcu_torture_fakewriter"); if (IS_ERR(fakewriter_tasks[i])) { firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + VERBOSE_TOROUT_ERRSTRING("Failed to create fakewriter"); fakewriter_tasks[i] = NULL; goto unwind; } @@ -1955,28 +1955,28 @@ rcu_torture_init(void) reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); if (reader_tasks == NULL) { - VERBOSE_PRINTK_ERRSTRING("out of memory"); + VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_reader task"); reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, "rcu_torture_reader"); if (IS_ERR(reader_tasks[i])) { firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + VERBOSE_TOROUT_ERRSTRING("Failed to create reader"); reader_tasks[i] = NULL; goto unwind; } } if (stat_interval > 0) { - VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + VERBOSE_TOROUT_STRING("Creating rcu_torture_stats task"); stats_task = kthread_run(rcu_torture_stats, NULL, "rcu_torture_stats"); if (IS_ERR(stats_task)) { firsterr = PTR_ERR(stats_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + VERBOSE_TOROUT_ERRSTRING("Failed to create stats"); stats_task = NULL; goto unwind; } @@ -1986,7 +1986,7 @@ rcu_torture_init(void) if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { firsterr = -ENOMEM; - VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); + VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); goto unwind; } @@ -1996,7 +1996,7 @@ rcu_torture_init(void) if (IS_ERR(shuffler_task)) { free_cpumask_var(shuffle_tmp_mask); firsterr = PTR_ERR(shuffler_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); shuffler_task = NULL; goto unwind; } @@ -2009,7 +2009,7 @@ rcu_torture_init(void) "rcu_torture_stutter"); if (IS_ERR(stutter_task)) { firsterr = PTR_ERR(stutter_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); stutter_task = NULL; goto unwind; } @@ -2022,7 +2022,7 @@ rcu_torture_init(void) "rcu_torture_fqs"); if (IS_ERR(fqs_task)) { firsterr = PTR_ERR(fqs_task); - VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + VERBOSE_TOROUT_ERRSTRING("Failed to create fqs"); fqs_task = NULL; goto unwind; } @@ -2052,7 +2052,7 @@ rcu_torture_init(void) "rcu_torture_shutdown"); if (IS_ERR(shutdown_task)) { firsterr = PTR_ERR(shutdown_task); - 
VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); + VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); shutdown_task = NULL; goto unwind; } -- cgit v1.3-14-g43fede From c2884de38e01134ae040d55aa5644049d1bb850f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jan 2014 07:30:50 -0800 Subject: rcutorture: Abstract TOROUT_STRING() and friends These diagnostic macros are not confined to torturing RCU, so this commit makes them available to other torture tests. Also removed the do-while from TOROUT_STRING() in response to checkpatch complaints. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 8 ++++++++ kernel/rcu/rcutorture.c | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 5aae880b6b4e..09be8887fb08 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -41,6 +41,14 @@ module_param(name, type, 0444); \ MODULE_PARM_DESC(name, msg); +#define TORTURE_FLAG "-torture:" +#define TOROUT_STRING(s) \ + pr_alert("%s" TORTURE_FLAG s "\n", torture_type) +#define VERBOSE_TOROUT_STRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_TOROUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + /* Low-rider random number generator. */ struct torture_random_state { unsigned long trs_state; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 34a75b1170e8..86fd8c11257b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -97,14 +97,6 @@ static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -#define TORTURE_FLAG "-torture:" -#define TOROUT_STRING(s) \ - do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_TOROUT_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) -#define VERBOSE_TOROUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) - static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; -- cgit v1.3-14-g43fede From f67a33561e6e5463b548219df98130da95f2e4a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 29 Jan 2014 07:40:27 -0800 Subject: rcutorture: Abstract torture_shutdown_absorb() Because handling races between rmmod and normal shutdown is not specific to rcutorture, this commit renames rcutorture_shutdown_absorb() to torture_shutdown_absorb() and pulls it out into the kernel/torture.c module. This implies pulling the fullstop mechanism into kernel/torture.c as well. The exporting of fullstop and fullstop_mutex is ugly and must die. And it does in fact die in later commits that introduce higher-level APIs that encapsulate both of these variables. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 15 +++++++++++++ kernel/rcu/rcutorture.c | 57 ++++++++++++++++--------------------------------- kernel/torture.c | 20 +++++++++++++++++ 3 files changed, 53 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 09be8887fb08..8474d776c864 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -41,6 +41,18 @@ module_param(name, type, 0444); \ MODULE_PARM_DESC(name, msg); +/* Mediate rmmod and system shutdown. 
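 * (Editor's note: fullstop is a small state machine -- FULLSTOP_DONTSTOP
 * while the test runs, FULLSTOP_SHUTDOWN once the reboot notifier fires,
 * and FULLSTOP_RMMOD once module removal begins -- with transitions
 * serialized by fullstop_mutex.)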
Concurrent rmmod & shutdown illegal! */ +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +extern int fullstop; +/* Protect fullstop transitions and spawning of kthreads. */ +extern struct mutex fullstop_mutex; + +/* Common module parameters. */ +extern char *torture_type; +extern bool verbose; + #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG s "\n", torture_type) @@ -57,4 +69,7 @@ struct torture_random_state { #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } unsigned long torture_random(struct torture_random_state *trsp); +/* Shutdown task absorption, for when the tasks cannot safely be killed. */ +void torture_shutdown_absorb(const char *title); + #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 86fd8c11257b..a868758a6f9c 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -91,11 +91,15 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); -torture_param(bool, verbose, false, "Enable verbose debugging printk()s"); -static char *torture_type = "rcu"; +char *torture_type = "rcu"; +EXPORT_SYMBOL_GPL(torture_type); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +bool verbose; +EXPORT_SYMBOL_GPL(verbose); +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); static int nrealreaders; static struct task_struct *writer_task; @@ -200,17 +204,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ - -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -static int fullstop = FULLSTOP_RMMOD; -/* - * Protect fullstop transitions and spawning of kthreads. - */ -static DEFINE_MUTEX(fullstop_mutex); - /* Forward reference. */ static void rcu_torture_cleanup(void); @@ -231,20 +224,6 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, return NOTIFY_DONE; } -/* - * Absorb kthreads into a kernel function that won't return, so that - * they won't ever access module text or data again. - */ -static void rcutorture_shutdown_absorb(const char *title) -{ - if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "rcutorture thread %s parking due to system shutdown\n", - title); - schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); - } -} - /* * Allocate an element from the rcu_tortures pool. */ @@ -286,7 +265,7 @@ rcu_stutter_wait(const char *title) schedule_timeout_interruptible(1); else schedule_timeout_interruptible(round_jiffies_relative(HZ)); - rcutorture_shutdown_absorb(title); + torture_shutdown_absorb(title); } } @@ -681,7 +660,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. 
*/ VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); - rcutorture_shutdown_absorb("rcu_torture_boost"); + torture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); smp_mb(); /* order accesses to ->inflight before stack-frame death. */ @@ -717,7 +696,7 @@ rcu_torture_fqs(void *arg) rcu_stutter_wait("rcu_torture_fqs"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fqs"); + torture_shutdown_absorb("rcu_torture_fqs"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -789,7 +768,7 @@ rcu_torture_writer(void *arg) rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); - rcutorture_shutdown_absorb("rcu_torture_writer"); + torture_shutdown_absorb("rcu_torture_writer"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -827,7 +806,7 @@ rcu_torture_fakewriter(void *arg) } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); - rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + torture_shutdown_absorb("rcu_torture_fakewriter"); while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); return 0; @@ -971,7 +950,7 @@ rcu_torture_reader(void *arg) rcu_stutter_wait("rcu_torture_reader"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); - rcutorture_shutdown_absorb("rcu_torture_reader"); + torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); while (!kthread_should_stop()) @@ -1095,7 +1074,7 @@ rcu_torture_stats(void *arg) do { schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); - rcutorture_shutdown_absorb("rcu_torture_stats"); + torture_shutdown_absorb("rcu_torture_stats"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); return 0; @@ -1179,7 +1158,7 @@ rcu_torture_shuffle(void *arg) do { schedule_timeout_interruptible(shuffle_interval * HZ); rcu_torture_shuffle_tasks(); - rcutorture_shutdown_absorb("rcu_torture_shuffle"); + torture_shutdown_absorb("rcu_torture_shuffle"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_shuffle task stopping"); return 0; @@ -1198,7 +1177,7 @@ rcu_torture_stutter(void *arg) if (!kthread_should_stop()) schedule_timeout_interruptible(stutter * HZ); stutter_pause_test = 0; - rcutorture_shutdown_absorb("rcu_torture_stutter"); + torture_shutdown_absorb("rcu_torture_stutter"); } while (!kthread_should_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stutter task stopping"); return 0; @@ -1470,7 +1449,7 @@ static int rcu_torture_stall(void *args) rcu_read_unlock(); pr_alert("rcu_torture_stall end.\n"); } - rcutorture_shutdown_absorb("rcu_torture_stall"); + torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); return 0; @@ -1534,7 +1513,7 @@ static int rcu_torture_barrier_cbs(void *arg) wake_up(&barrier_wq); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); + torture_shutdown_absorb("rcu_torture_barrier_cbs"); while 
(!kthread_should_stop()) schedule_timeout_interruptible(1); cur_ops->cb_barrier(); @@ -1571,7 +1550,7 @@ static int rcu_torture_barrier(void *arg) schedule_timeout_interruptible(HZ / 10); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); - rcutorture_shutdown_absorb("rcu_torture_barrier"); + torture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) schedule_timeout_interruptible(1); return 0; diff --git a/kernel/torture.c b/kernel/torture.c index c82c70f7828e..f05042036ae8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -49,6 +49,11 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); +int fullstop = FULLSTOP_RMMOD; +EXPORT_SYMBOL_GPL(fullstop); +DEFINE_MUTEX(fullstop_mutex); +EXPORT_SYMBOL_GPL(fullstop_mutex); + #define TORTURE_RANDOM_MULT 39916801 /* prime */ #define TORTURE_RANDOM_ADD 479001701 /* prime */ #define TORTURE_RANDOM_REFRESH 10000 @@ -69,3 +74,18 @@ torture_random(struct torture_random_state *trsp) return swahw32(trsp->trs_state); } EXPORT_SYMBOL_GPL(torture_random); + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + */ +void torture_shutdown_absorb(const char *title) +{ + while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + pr_notice( + "torture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} +EXPORT_SYMBOL_GPL(torture_shutdown_absorb); -- cgit v1.3-14-g43fede From 3808dc9fab05913060626d7f0edd0f195cb9dcab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Jan 2014 15:29:21 -0800 Subject: rcutorture: Abstract torture_shuffle() The torture_shuffle() function forces each CPU in turn to go idle periodically in order to check for problems interacting with per-CPU variables and with dyntick-idle mode. Because this sort of debugging is not specific to RCU, this commit abstracts that functionality. This in turn requires abstracting some additional infrastructure. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 5 ++ kernel/rcu/rcutorture.c | 124 +++++---------------------------------- kernel/torture.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 8474d776c864..544a2c33c1ae 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -69,6 +69,11 @@ struct torture_random_state { #define DEFINE_TORTURE_RANDOM(name) struct torture_random_state name = { 0, 0 } unsigned long torture_random(struct torture_random_state *trsp); +/* Task shuffler, which causes CPUs to occasionally go idle. */ +void torture_shuffle_task_register(struct task_struct *tp); +int torture_shuffle_init(long shuffint); +void torture_shuffle_cleanup(void); + /* Shutdown task absorption, for when the tasks cannot safely be killed. 
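 * (Editor's sketch of typical use in a torture kthread's exit path,
 * following the stop handshake used throughout this series; "my_task"
 * is purely illustrative:
 *
 *	do {
 *		...do one torture operation...
 *	} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
 *	torture_shutdown_absorb("my_task");
 *	while (!kthread_should_stop())
 *		schedule_timeout_uninterruptible(1);
 * )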
*/ void torture_shutdown_absorb(const char *title); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a868758a6f9c..0380696f1844 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -106,7 +106,6 @@ static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct *shuffler_task; static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; @@ -161,7 +160,6 @@ static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; -static cpumask_var_t shuffle_tmp_mask; static int stutter_pause_test; @@ -1080,90 +1078,6 @@ rcu_torture_stats(void *arg) return 0; } -static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ - -/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case - * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. - */ -static void rcu_torture_shuffle_tasks(void) -{ - int i; - - cpumask_setall(shuffle_tmp_mask); - get_online_cpus(); - - /* No point in shuffling if there is only one online CPU (ex: UP) */ - if (num_online_cpus() == 1) { - put_online_cpus(); - return; - } - - if (rcu_idle_cpu != -1) - cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); - - set_cpus_allowed_ptr(current, shuffle_tmp_mask); - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - if (reader_tasks[i]) - set_cpus_allowed_ptr(reader_tasks[i], - shuffle_tmp_mask); - } - if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) - if (fakewriter_tasks[i]) - set_cpus_allowed_ptr(fakewriter_tasks[i], - shuffle_tmp_mask); - } - if (writer_task) - set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); - if (stats_task) - set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); - if (stutter_task) - set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); - if (fqs_task) - set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); - if (shutdown_task) - set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); -#ifdef CONFIG_HOTPLUG_CPU - if (onoff_task) - set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - if (stall_task) - set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); - if (barrier_cbs_tasks) - for (i = 0; i < n_barrier_cbs; i++) - if (barrier_cbs_tasks[i]) - set_cpus_allowed_ptr(barrier_cbs_tasks[i], - shuffle_tmp_mask); - if (barrier_task) - set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); - - if (rcu_idle_cpu == -1) - rcu_idle_cpu = num_online_cpus() - 1; - else - rcu_idle_cpu--; - - put_online_cpus(); -} - -/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the - * system to become idle at a time and cut off its timer ticks. This is meant - * to test the support for such tickless idle CPU in RCU. - */ -static int -rcu_torture_shuffle(void *arg) -{ - VERBOSE_TOROUT_STRING("rcu_torture_shuffle task started"); - do { - schedule_timeout_interruptible(shuffle_interval * HZ); - rcu_torture_shuffle_tasks(); - torture_shutdown_absorb("rcu_torture_shuffle"); - } while (!kthread_should_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_shuffle task stopping"); - return 0; -} - /* Cause the rcutorture test to "stutter", starting and stopping all * threads periodically. 
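 * (Editor's note: concretely, the stutter kthread periodically sets and
 * clears stutter_pause_test, and the other torture kthreads poll that
 * flag via rcu_stutter_wait(), so the whole test alternates between
 * running flat out and sitting idle.)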
*/ @@ -1397,6 +1311,7 @@ rcu_torture_onoff_init(void) onoff_task = NULL; return ret; } + torture_shuffle_task_register(onoff_task); return 0; } @@ -1468,6 +1383,7 @@ static int __init rcu_torture_stall_init(void) stall_task = NULL; return ret; } + torture_shuffle_task_register(stall_task); return 0; } @@ -1594,6 +1510,7 @@ static int rcu_torture_barrier_init(void) barrier_cbs_tasks[i] = NULL; return ret; } + torture_shuffle_task_register(barrier_cbs_tasks[i]); } barrier_task = kthread_run(rcu_torture_barrier, NULL, "rcu_torture_barrier"); @@ -1602,6 +1519,7 @@ static int rcu_torture_barrier_init(void) VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); barrier_task = NULL; } + torture_shuffle_task_register(barrier_task); return 0; } @@ -1674,6 +1592,8 @@ rcu_torture_cleanup(void) fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); + + torture_shuffle_cleanup(); /* Must be first task cleaned up. */ rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { @@ -1681,12 +1601,6 @@ rcu_torture_cleanup(void) kthread_stop(stutter_task); } stutter_task = NULL; - if (shuffler_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_shuffle task"); - kthread_stop(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - } - shuffler_task = NULL; if (writer_task) { VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); @@ -1904,6 +1818,7 @@ rcu_torture_init(void) writer_task = NULL; goto unwind; } + torture_shuffle_task_register(writer_task); wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); @@ -1922,6 +1837,7 @@ rcu_torture_init(void) fakewriter_tasks[i] = NULL; goto unwind; } + torture_shuffle_task_register(fakewriter_tasks[i]); } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); @@ -1940,6 +1856,7 @@ rcu_torture_init(void) reader_tasks[i] = NULL; goto unwind; } + torture_shuffle_task_register(reader_tasks[i]); } if (stat_interval > 0) { VERBOSE_TOROUT_STRING("Creating rcu_torture_stats task"); @@ -1951,26 +1868,12 @@ rcu_torture_init(void) stats_task = NULL; goto unwind; } + torture_shuffle_task_register(stats_task); } if (test_no_idle_hz) { - rcu_idle_cpu = num_online_cpus() - 1; - - if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { - firsterr = -ENOMEM; - VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + firsterr = torture_shuffle_init(shuffle_interval * HZ); + if (firsterr) goto unwind; - } - - /* Create the shuffler thread */ - shuffler_task = kthread_run(rcu_torture_shuffle, NULL, - "rcu_torture_shuffle"); - if (IS_ERR(shuffler_task)) { - free_cpumask_var(shuffle_tmp_mask); - firsterr = PTR_ERR(shuffler_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - goto unwind; - } } if (stutter < 0) stutter = 0; @@ -1984,6 +1887,7 @@ rcu_torture_init(void) stutter_task = NULL; goto unwind; } + torture_shuffle_task_register(stutter_task); } if (fqs_duration < 0) fqs_duration = 0; @@ -1997,6 +1901,7 @@ rcu_torture_init(void) fqs_task = NULL; goto unwind; } + torture_shuffle_task_register(fqs_task); } if (test_boost_interval < 1) test_boost_interval = 1; @@ -2027,6 +1932,7 @@ rcu_torture_init(void) shutdown_task = NULL; goto unwind; } + torture_shuffle_task_register(shutdown_task); wake_up_process(shutdown_task); } i = rcu_torture_onoff_init(); diff --git a/kernel/torture.c b/kernel/torture.c index f05042036ae8..26058f20ee83 100644 --- a/kernel/torture.c +++ b/kernel/torture.c 
@@ -75,6 +75,157 @@ torture_random(struct torture_random_state *trsp) } EXPORT_SYMBOL_GPL(torture_random); +/* + * Variables for shuffling. The idea is to ensure that each CPU stays + * idle for an extended period to test interactions with dyntick idle, + * as well as interactions with any per-CPU variables. + */ +struct shuffle_task { + struct list_head st_l; + struct task_struct *st_t; +}; + +static long shuffle_interval; /* In jiffies. */ +static struct task_struct *shuffler_task; +static cpumask_var_t shuffle_tmp_mask; +static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */ +static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list); +static DEFINE_MUTEX(shuffle_task_mutex); + +/* + * Register a task to be shuffled. If there is no memory, just splat + * and don't bother registering. + */ +void torture_shuffle_task_register(struct task_struct *tp) +{ + struct shuffle_task *stp; + + if (WARN_ON_ONCE(tp == NULL)) + return; + stp = kmalloc(sizeof(*stp), GFP_KERNEL); + if (WARN_ON_ONCE(stp == NULL)) + return; + stp->st_t = tp; + mutex_lock(&shuffle_task_mutex); + list_add(&stp->st_l, &shuffle_task_list); + mutex_unlock(&shuffle_task_mutex); +} +EXPORT_SYMBOL_GPL(torture_shuffle_task_register); + +/* + * Unregister all tasks, for example, at the end of the torture run. + */ +static void torture_shuffle_task_unregister_all(void) +{ + struct shuffle_task *stp; + struct shuffle_task *p; + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) { + list_del(&stp->st_l); + kfree(stp); + } + mutex_unlock(&shuffle_task_mutex); +} + +/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle. + * A special case is when shuffle_idle_cpu = -1, in which case we allow + * the tasks to run on all CPUs. + */ +static void torture_shuffle_tasks(void) +{ + struct shuffle_task *stp; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */ + shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask); + if (shuffle_idle_cpu >= nr_cpu_ids) + shuffle_idle_cpu = -1; + if (shuffle_idle_cpu != -1) { + cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask); + if (cpumask_empty(shuffle_tmp_mask)) { + put_online_cpus(); + return; + } + } + + mutex_lock(&shuffle_task_mutex); + list_for_each_entry(stp, &shuffle_task_list, st_l) + set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask); + mutex_unlock(&shuffle_task_mutex); + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int torture_shuffle(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval); + torture_shuffle_tasks(); + torture_shutdown_absorb("torture_shuffle"); + } while (!torture_must_stop()); + VERBOSE_TOROUT_STRING("torture_shuffle task stopping"); + return 0; +} + +/* + * Start the shuffler, with shuffint in jiffies.
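Taken together, a client module starts the shuffler once and registers each kthread it spawns. A hedged sketch of the client side (hypothetical names; error unwinding trimmed):

	static struct task_struct *foo_task;

	/* During module init: re-shuffle registered tasks every 3 seconds. */
	ret = torture_shuffle_init(3 * HZ);
	if (ret)
		return ret;

	/* For each torture kthread the module creates: */
	foo_task = kthread_run(foo_kthread, NULL, "foo_kthread");
	if (!IS_ERR(foo_task))
		torture_shuffle_task_register(foo_task);
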
+ */ +int torture_shuffle_init(long shuffint) +{ + int ret; + + shuffle_interval = shuffint; + + shuffle_idle_cpu = -1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask"); + return -ENOMEM; + } + + /* Create the shuffler thread */ + shuffler_task = kthread_run(torture_shuffle, NULL, "torture_shuffle"); + if (IS_ERR(shuffler_task)) { + ret = PTR_ERR(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + return ret; + } + torture_shuffle_task_register(shuffler_task); + return 0; +} +EXPORT_SYMBOL_GPL(torture_shuffle_init); + +/* + * Stop the shuffling. + */ +void torture_shuffle_cleanup(void) +{ + torture_shuffle_task_unregister_all(); + if (shuffler_task) { + VERBOSE_TOROUT_STRING("Stopping torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); + /* * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. -- cgit v1.3-14-g43fede From 2e9e8081d2e7a4efb582a240aa7fee991bbbabb0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 28 Jan 2014 15:58:22 -0800 Subject: rcutorture: Abstract torture_onoff() Because online/offline torturing is not specific to RCU, this commit abstracts it into the kernel/torture.c module to allow other torture tests to use it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 12 ++++ kernel/rcu/rcutorture.c | 162 ++--------------------------------------- kernel/torture.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 202 insertions(+), 158 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 544a2c33c1ae..be3694a702e9 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -61,6 +61,18 @@ extern bool verbose; #define VERBOSE_TOROUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) +/* Definitions for a non-string torture-test module parameter. */ +#define torture_param(type, name, init, msg) \ + static type name = init; \ + module_param(name, type, 0444); \ + MODULE_PARM_DESC(name, msg); + +/* Definitions for online/offline exerciser. */ +int torture_onoff_init(long ooholdoff, long oointerval); +void torture_onoff_cleanup(void); +char *torture_onoff_stats(char *page); +bool torture_onoff_failures(void); + /* Low-rider random number generator.
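The torture_param() macro added above removes three lines of boilerplate per module parameter. For a hypothetical parameter:

	torture_param(int, foo_interval, 10, "Seconds between foo operations");

	/* ...which expands to: */
	static int foo_interval = 10;
	module_param(foo_interval, int, 0444);
	MODULE_PARM_DESC(foo_interval, "Seconds between foo operations");

The 0444 mode makes the parameter visible but read-only via sysfs.
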
*/ struct torture_random_state { unsigned long trs_state; diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0380696f1844..0e8b52b71d76 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -110,9 +110,6 @@ static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *shutdown_task; -#ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *onoff_task; -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ static struct task_struct *stall_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -147,16 +144,6 @@ static long n_rcu_torture_boost_rterror; static long n_rcu_torture_boost_failure; static long n_rcu_torture_boosts; static long n_rcu_torture_timers; -static long n_offline_attempts; -static long n_offline_successes; -static unsigned long sum_offline; -static int min_offline = -1; -static int max_offline; -static long n_online_attempts; -static long n_online_successes; -static unsigned long sum_online; -static int min_online = -1; -static int max_online; static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; @@ -994,13 +981,7 @@ rcu_torture_printk(char *page) n_rcu_torture_boost_failure, n_rcu_torture_boosts, n_rcu_torture_timers); - page += sprintf(page, - "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", - n_online_successes, n_online_attempts, - n_offline_successes, n_offline_attempts, - min_online, max_online, - min_offline, max_offline, - sum_online, sum_offline, HZ); + page = torture_onoff_stats(page); page += sprintf(page, "barrier: %ld/%ld:%ld", n_barrier_successes, n_barrier_attempts, @@ -1204,140 +1185,6 @@ rcu_torture_shutdown(void *arg) return 0; } -#ifdef CONFIG_HOTPLUG_CPU - -/* - * Execute random CPU-hotplug operations at the interval specified - * by the onoff_interval. 
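On the client side, the whole exerciser now reduces to three calls. A sketch, with both times in jiffies as in the init call shown further below (ret, page, and the message text are illustrative):

	/* Module init: 30s holdoff, then hotplug a random CPU every 5s. */
	ret = torture_onoff_init(30 * HZ, 5 * HZ);
	if (ret)
		goto unwind;

	/* Statistics pass: appends the "onoff:" line to the buffer. */
	page = torture_onoff_stats(page);

	/* End of test: check whether any online/offline attempt failed. */
	if (torture_onoff_failures())
		pr_alert("End of test: hotplug failures\n");
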
- */ -static int -rcu_torture_onoff(void *arg) -{ - int cpu; - unsigned long delta; - int maxcpu = -1; - DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; - - VERBOSE_TOROUT_STRING("rcu_torture_onoff task started"); - for_each_online_cpu(cpu) - maxcpu = cpu; - WARN_ON(maxcpu < 0); - if (onoff_holdoff > 0) { - VERBOSE_TOROUT_STRING("rcu_torture_onoff begin holdoff"); - schedule_timeout_interruptible(onoff_holdoff * HZ); - VERBOSE_TOROUT_STRING("rcu_torture_onoff end holdoff"); - } - while (!kthread_should_stop()) { - cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } - schedule_timeout_interruptible(onoff_interval * HZ); - } - VERBOSE_TOROUT_STRING("rcu_torture_onoff task stopping"); - return 0; -} - -static int -rcu_torture_onoff_init(void) -{ - int ret; - - if (onoff_interval <= 0) - return 0; - onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return ret; - } - torture_shuffle_task_register(onoff_task); - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ - if (onoff_task == NULL) - return; - VERBOSE_TOROUT_STRING("Stopping rcu_torture_onoff task"); - kthread_stop(onoff_task); - onoff_task = NULL; -} - -#else /* #ifdef CONFIG_HOTPLUG_CPU */ - -static int -rcu_torture_onoff_init(void) -{ - return 0; -} - -static void rcu_torture_onoff_cleanup(void) -{ -} - -#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ - /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -1657,7 +1504,7 @@ rcu_torture_cleanup(void) kthread_stop(shutdown_task); } shutdown_task = NULL; - rcu_torture_onoff_cleanup(); + torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. 
*/ @@ -1668,8 +1515,7 @@ rcu_torture_cleanup(void) if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); - else if (n_online_successes != n_online_attempts || - n_offline_successes != n_offline_attempts) + else if (torture_onoff_failures()) rcu_torture_print_module_parms(cur_ops, "End of test: RCU_HOTPLUG"); else @@ -1935,7 +1781,7 @@ rcu_torture_init(void) torture_shuffle_task_register(shutdown_task); wake_up_process(shutdown_task); } - i = rcu_torture_onoff_init(); + i = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); if (i != 0) { firsterr = i; goto unwind; diff --git a/kernel/torture.c b/kernel/torture.c index 26058f20ee83..a7ec8a7d561e 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -54,6 +54,192 @@ EXPORT_SYMBOL_GPL(fullstop); DEFINE_MUTEX(fullstop_mutex); EXPORT_SYMBOL_GPL(fullstop_mutex); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Variables for online-offline handling. Only present if CPU hotplug + * is enabled, otherwise does nothing. + */ + +static struct task_struct *onoff_task; +static long onoff_holdoff; +static long onoff_interval; +static long n_offline_attempts; +static long n_offline_successes; +static unsigned long sum_offline; +static int min_offline = -1; +static int max_offline; +static long n_online_attempts; +static long n_online_successes; +static unsigned long sum_online; +static int min_online = -1; +static int max_online; + +/* + * Execute random CPU-hotplug operations at the interval specified + * by the onoff_interval. + */ +static int +torture_onoff(void *arg) +{ + int cpu; + unsigned long delta; + int maxcpu = -1; + DEFINE_TORTURE_RANDOM(rand); + int ret; + unsigned long starttime; + + VERBOSE_TOROUT_STRING("torture_onoff task started"); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff); + VERBOSE_TOROUT_STRING("torture_onoff end holdoff"); + } + while (!torture_must_stop()) { + cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); + if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_offline_attempts++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + n_offline_successes++; + delta = jiffies - starttime; + sum_offline += delta; + if (min_offline < 0) { + min_offline = delta; + max_offline = delta; + } + if (min_offline > delta) + min_offline = delta; + if (max_offline < delta) + max_offline = delta; + } + } else if (cpu_is_hotpluggable(cpu)) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + n_online_attempts++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + n_online_successes++; + delta = jiffies - starttime; + sum_online += delta; + if (min_online < 0) { + min_online = delta; + max_online = delta; + } + if (min_online > delta) + min_online = delta; + if 
(max_online < delta) + max_online = delta; + } + } + schedule_timeout_interruptible(onoff_interval); + } + VERBOSE_TOROUT_STRING("torture_onoff task stopping"); + return 0; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Initiate online-offline handling. + */ +int torture_onoff_init(long ooholdoff, long oointerval) +{ +#ifdef CONFIG_HOTPLUG_CPU + int ret; + + onoff_holdoff = ooholdoff; + onoff_interval = oointerval; + if (onoff_interval <= 0) + return 0; + onoff_task = kthread_run(torture_onoff, NULL, "torture_onoff"); + if (IS_ERR(onoff_task)) { + ret = PTR_ERR(onoff_task); + onoff_task = NULL; + return ret; + } + torture_shuffle_task_register(onoff_task); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return 0; +} +EXPORT_SYMBOL_GPL(torture_onoff_init); + +/* + * Clean up after online/offline testing. + */ +void torture_onoff_cleanup(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (onoff_task == NULL) + return; + VERBOSE_TOROUT_STRING("Stopping torture_onoff task"); + kthread_stop(onoff_task); + onoff_task = NULL; +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_cleanup); + +/* + * Print online/offline testing statistics. + */ +char *torture_onoff_stats(char *page) +{ +#ifdef CONFIG_HOTPLUG_CPU + page += sprintf(page, + "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", + n_online_successes, n_online_attempts, + n_offline_successes, n_offline_attempts, + min_online, max_online, + min_offline, max_offline, + sum_online, sum_offline, HZ); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + return page; +} +EXPORT_SYMBOL_GPL(torture_onoff_stats); + +/* + * Were all the online/offline operations successful? + */ +bool torture_onoff_failures(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + return n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts; +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + return false; +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ +} +EXPORT_SYMBOL_GPL(torture_onoff_failures); + #define TORTURE_RANDOM_MULT 39916801 /* prime */ #define TORTURE_RANDOM_ADD 479001701 /* prime */ #define TORTURE_RANDOM_REFRESH 10000 -- cgit v1.3-14-g43fede From b5daa8f3b3b2b0133ad40e13d4f722070119ce36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 13:38:09 -0800 Subject: rcutorture: Abstract torture-test initialization This commit creates torture_init_begin() and torture_init_end() functions to abstract locking and allow the torture_type and verbose variables in kernel/torture.o to become static. With a bit more abstraction, fullstop_mutex will also become static. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 9 +++++---- kernel/rcu/rcutorture.c | 19 ++++++++----------- kernel/torture.c | 27 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index be3694a702e9..428d2712be04 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -49,10 +49,6 @@ extern int fullstop; /* Protect fullstop transitions and spawning of kthreads. */ extern struct mutex fullstop_mutex; -/* Common module parameters. */ -extern char *torture_type; -extern bool verbose; - #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG s "\n", torture_type) @@ -89,4 +85,9 @@ void torture_shuffle_cleanup(void); /* Shutdown task absorption, for when the tasks cannot safely be killed. */ void torture_shutdown_absorb(const char *title); +/* Initialization and cleanup. 
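The client-side pattern these two functions (declared just below) support is a bracketed init sequence, with the fullstop_mutex handling hidden inside the pair. A sketch for a hypothetical client, matching the two-argument form introduced in this commit:

	static int __init foo_torture_init(void)
	{
		int firsterr = 0;

		torture_init_begin("foo", verbose); /* Acquires fullstop_mutex. */
		/* ... parse parameters, allocate state, spawn kthreads ... */
		torture_init_end();                 /* Releases fullstop_mutex. */
		return firsterr;
	}
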
*/ + +void torture_init_begin(char *ttype, bool v); +void torture_init_end(void); + #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0e8b52b71d76..93aca2f9261e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -91,15 +91,12 @@ torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds."); torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); -char *torture_type = "rcu"; -EXPORT_SYMBOL_GPL(torture_type); +static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); -bool verbose; -EXPORT_SYMBOL_GPL(verbose); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); static int nrealreaders; static struct task_struct *writer_task; @@ -1425,8 +1422,8 @@ rcu_torture_cleanup(void) { int i; - mutex_lock(&fullstop_mutex); rcutorture_record_test_transition(); + mutex_lock(&fullstop_mutex); if (fullstop == FULLSTOP_SHUTDOWN) { pr_warn(/* but going down anyway, so... */ "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); @@ -1589,7 +1586,7 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, }; - mutex_lock(&fullstop_mutex); + torture_init_begin(torture_type, verbose); /* Process args and tell the world that the torturer is on the job. */ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1604,7 +1601,7 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_alert(" %s", torture_ops[i]->name); pr_alert("\n"); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return -EINVAL; } if (cur_ops->fqs == NULL && fqs_duration != 0) { @@ -1800,11 +1797,11 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); rcutorture_record_test_transition(); - mutex_unlock(&fullstop_mutex); + torture_init_end(); return 0; unwind: - mutex_unlock(&fullstop_mutex); + torture_init_end(); rcu_torture_cleanup(); return firsterr; } diff --git a/kernel/torture.c b/kernel/torture.c index a7ec8a7d561e..828d0b1a49b8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -49,6 +49,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); +static char *torture_type; +static bool verbose; + int fullstop = FULLSTOP_RMMOD; EXPORT_SYMBOL_GPL(fullstop); DEFINE_MUTEX(fullstop_mutex); @@ -426,3 +429,27 @@ void torture_shutdown_absorb(const char *title) } } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); + +/* + * Initialize torture module. Please note that this is -not- invoked via + * the usual module_init() mechanism, but rather by an explicit call from + * the client torture module. This call must be paired with a later + * torture_init_end(). + */ +void __init torture_init_begin(char *ttype, bool v) +{ + mutex_lock(&fullstop_mutex); + torture_type = ttype; + verbose = v; + +} +EXPORT_SYMBOL_GPL(torture_init_begin); + +/* + * Tell the torture module that initialization is complete. + */ +void __init torture_init_end(void) +{ + mutex_unlock(&fullstop_mutex); +} +EXPORT_SYMBOL_GPL(torture_init_end); -- cgit v1.3-14-g43fede From cc47ae0830264f07442070b36fe0d0a4d4e3c313 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 14:21:11 -0800 Subject: rcutorture: Abstract torture-test cleanup This commit creates a torture_cleanup() that handles the generic cleanup actions local to kernel/torture.c. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 4 +--- kernel/rcu/rcutorture.c | 11 +---------- kernel/torture.c | 30 ++++++++++++++++++++++++++++-- 3 files changed, 30 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 428d2712be04..31d69930e5cf 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -65,7 +65,6 @@ extern struct mutex fullstop_mutex; /* Definitions for online/offline exerciser. */ int torture_onoff_init(long ooholdoff, long oointerval); -void torture_onoff_cleanup(void); char *torture_onoff_stats(char *page); bool torture_onoff_failures(void); @@ -80,14 +79,13 @@ unsigned long torture_random(struct torture_random_state *trsp); /* Task shuffler, which causes CPUs to occasionally go idle. */ void torture_shuffle_task_register(struct task_struct *tp); int torture_shuffle_init(long shuffint); -void torture_shuffle_cleanup(void); /* Shutdown task absorption, for when the tasks cannot safely be killed. */ void torture_shutdown_absorb(const char *title); /* Initialization and cleanup. */ - void torture_init_begin(char *ttype, bool v); void torture_init_end(void); +bool torture_cleanup(void); #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 93aca2f9261e..68a689fc6ffa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1423,21 +1423,13 @@ rcu_torture_cleanup(void) int i; rcutorture_record_test_transition(); - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_SHUTDOWN) { - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - schedule_timeout_uninterruptible(10); + if (torture_cleanup()) { if (cur_ops->cb_barrier != NULL) cur_ops->cb_barrier(); return; } - fullstop = FULLSTOP_RMMOD; - mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); - torture_shuffle_cleanup(); /* Must be first task cleaned up. */ rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); if (stutter_task) { @@ -1501,7 +1493,6 @@ rcu_torture_cleanup(void) kthread_stop(shutdown_task); } shutdown_task = NULL; - torture_onoff_cleanup(); /* Wait for all RCU callbacks to fire. */ diff --git a/kernel/torture.c b/kernel/torture.c index 828d0b1a49b8..41ae5cc3c4c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -199,7 +199,7 @@ EXPORT_SYMBOL_GPL(torture_onoff_init); /* * Clean up after online/offline testing. */ -void torture_onoff_cleanup(void) +static void torture_onoff_cleanup(void) { #ifdef CONFIG_HOTPLUG_CPU if (onoff_task == NULL) @@ -403,7 +403,7 @@ EXPORT_SYMBOL_GPL(torture_shuffle_init); /* * Stop the shuffling. */ -void torture_shuffle_cleanup(void) +static void torture_shuffle_cleanup(void) { torture_shuffle_task_unregister_all(); if (shuffler_task) { @@ -453,3 +453,29 @@ void __init torture_init_end(void) mutex_unlock(&fullstop_mutex); } EXPORT_SYMBOL_GPL(torture_init_end); + +/* + * Clean up torture module. Please note that this is -not- invoked via + * the usual module_exit() mechanism, but rather by an explicit call from + * the client torture module. Returns true if a race with system shutdown + * is detected. + * + * This must be called before the caller starts shutting down its own + * kthreads. 
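From the client's perspective, the contract is simple: call torture_cleanup() before stopping any of its own kthreads, and bail out if it reports a race with shutdown. A sketch (foo_torture_cleanup() is hypothetical; the torture_cleanup() definition follows below):

	static void foo_torture_cleanup(void)
	{
		if (torture_cleanup())
			return;	/* Racing with shutdown; touch nothing. */

		/* Only now stop and free this module's own kthreads. */
	}
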
+ */ +bool torture_cleanup(void) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_SHUTDOWN) { + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + return true; + } + fullstop = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + torture_shuffle_cleanup(); + torture_onoff_cleanup(); + return false; +} +EXPORT_SYMBOL_GPL(torture_cleanup); -- cgit v1.3-14-g43fede From 4622b487ecf0094401ac10e504606e5cbdea5a6e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 15:37:19 -0800 Subject: rcutorture: Abstract torture_shutdown_notify() Because handling the race between rmmod and system shutdown is not specific to RCU, this commit abstracts torture_shutdown_notify(), placing this code into kernel/torture.c. This change also allows fullstop_mutex to be private to kernel/torture.c. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 2 -- kernel/rcu/rcutorture.c | 23 ----------------------- kernel/torture.c | 29 ++++++++++++++++++++++++----- 3 files changed, 24 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 31d69930e5cf..742d8a402f19 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -46,8 +46,6 @@ #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ extern int fullstop; -/* Protect fullstop transitions and spawning of kthreads. */ -extern struct mutex fullstop_mutex; #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 68a689fc6ffa..2560e9313887 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -189,23 +189,6 @@ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); /* Forward reference. */ static void rcu_torture_cleanup(void); -/* - * Detect and respond to a system shutdown. - */ -static int -rcutorture_shutdown_notify(struct notifier_block *unused1, - unsigned long unused2, void *unused3) -{ - mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) - fullstop = FULLSTOP_SHUTDOWN; - else - pr_warn(/* but going down anyway, so... */ - "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); - mutex_unlock(&fullstop_mutex); - return NOTIFY_DONE; -} - /* * Allocate an element from the rcu_tortures pool. 
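The function removed above was a standard reboot-notifier callback. For reference, the idiom looks like this (hypothetical names; kernel/torture.c's own version appears in the hunk below):

	static int foo_shutdown_notify(struct notifier_block *unused1,
				       unsigned long unused2, void *unused3)
	{
		/* Record that the system is going down. */
		return NOTIFY_DONE;
	}

	static struct notifier_block foo_shutdown_nb = {
		.notifier_call = foo_shutdown_notify,
	};

With this patch, torture_init_end() calls register_reboot_notifier() and torture_cleanup() calls unregister_reboot_notifier(), so client modules no longer manage the notifier themselves.
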
*/ @@ -1098,10 +1081,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) onoff_interval, onoff_holdoff); } -static struct notifier_block rcutorture_shutdown_nb = { - .notifier_call = rcutorture_shutdown_notify, -}; - static void rcutorture_booster_cleanup(int cpu) { struct task_struct *t; @@ -1428,7 +1407,6 @@ rcu_torture_cleanup(void) cur_ops->cb_barrier(); return; } - unregister_reboot_notifier(&rcutorture_shutdown_nb); rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); @@ -1774,7 +1752,6 @@ rcu_torture_init(void) firsterr = i; goto unwind; } - register_reboot_notifier(&rcutorture_shutdown_nb); i = rcu_torture_stall_init(); if (i != 0) { firsterr = i; diff --git a/kernel/torture.c b/kernel/torture.c index 41ae5cc3c4c3..b02fa2785bbb 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -54,8 +54,7 @@ static bool verbose; int fullstop = FULLSTOP_RMMOD; EXPORT_SYMBOL_GPL(fullstop); -DEFINE_MUTEX(fullstop_mutex); -EXPORT_SYMBOL_GPL(fullstop_mutex); +static DEFINE_MUTEX(fullstop_mutex); #ifdef CONFIG_HOTPLUG_CPU @@ -422,14 +421,32 @@ EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); void torture_shutdown_absorb(const char *title) { while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { - pr_notice( - "torture thread %s parking due to system shutdown\n", - title); + pr_notice("torture thread %s parking due to system shutdown\n", + title); schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); } } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); +/* + * Detect and respond to a system shutdown. + */ +static int torture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + pr_warn("Concurrent rmmod and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +static struct notifier_block torture_shutdown_nb = { + .notifier_call = torture_shutdown_notify, +}; + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from @@ -451,6 +468,7 @@ EXPORT_SYMBOL_GPL(torture_init_begin); void __init torture_init_end(void) { mutex_unlock(&fullstop_mutex); + register_reboot_notifier(&torture_shutdown_nb); } EXPORT_SYMBOL_GPL(torture_init_end); @@ -474,6 +492,7 @@ bool torture_cleanup(void) } fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&torture_shutdown_nb); torture_shuffle_cleanup(); torture_onoff_cleanup(); return false; -- cgit v1.3-14-g43fede From 36970bb91d89618d3495babf44b934e9c9db6bbc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 15:49:29 -0800 Subject: rcutorture: Privatize fullstop This commit introduces the torture_must_stop() function in order to keep use of the fullstop variable local to kernel/torture.c. There is also a torture_must_stop_irq() counterpart for use from RCU callbacks, timeout handlers, and the like. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 8 ++------ kernel/rcu/rcutorture.c | 38 +++++++++++++++----------------------- kernel/torture.c | 27 +++++++++++++++++++++++++-- 3 files changed, 42 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 742d8a402f19..0259db38bfb0 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -41,12 +41,6 @@ module_param(name, type, 0444); \ MODULE_PARM_DESC(name, msg); -/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ -#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ -#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ -#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ -extern int fullstop; - #define TORTURE_FLAG "-torture:" #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG s "\n", torture_type) @@ -85,5 +79,7 @@ void torture_shutdown_absorb(const char *title); void torture_init_begin(char *ttype, bool v); void torture_init_end(void); bool torture_cleanup(void); +bool torture_must_stop(void); +bool torture_must_stop_irq(void); #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2560e9313887..9357c88cc8cc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -304,7 +304,7 @@ rcu_torture_cb(struct rcu_head *p) int i; struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); - if (fullstop != FULLSTOP_DONTSTOP) { + if (torture_must_stop_irq()) { /* Test is ending, just drop callbacks on the floor. */ /* The next initialization will pick up the pieces. */ return; @@ -572,8 +572,7 @@ static int rcu_torture_boost(void *arg) while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) goto checkwait; } @@ -595,8 +594,7 @@ static int rcu_torture_boost(void *arg) } cond_resched(); rcu_stutter_wait("rcu_torture_boost"); - if (kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) goto checkwait; } @@ -621,7 +619,7 @@ static int rcu_torture_boost(void *arg) /* Go do the stutter. */ checkwait: rcu_stutter_wait("rcu_torture_boost"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); /* Clean up and exit. 
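The resulting kthread and callback patterns look like this (a sketch; the foo names are hypothetical). Note that torture_must_stop() folds in kthread_should_stop(), so it is for process context only; the _irq variant omits that check:

	/* In a torture kthread: */
	do {
		/* ... one torture pass ... */
		schedule_timeout_interruptible(1);
	} while (!torture_must_stop());

	/* In an RCU callback or timer handler: */
	static void foo_cb(struct rcu_head *rhp)
	{
		if (torture_must_stop_irq())
			return;	/* Test is ending; drop the work on the floor. */
		/* ... otherwise requeue or process ... */
	}
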
*/ VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); @@ -659,7 +657,7 @@ rcu_torture_fqs(void *arg) fqs_burst_remaining -= fqs_holdoff; } rcu_stutter_wait("rcu_torture_fqs"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); torture_shutdown_absorb("rcu_torture_fqs"); while (!kthread_should_stop()) @@ -731,7 +729,7 @@ rcu_torture_writer(void *arg) } rcutorture_record_progress(++rcu_torture_current_version); rcu_stutter_wait("rcu_torture_writer"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); torture_shutdown_absorb("rcu_torture_writer"); while (!kthread_should_stop()) @@ -768,7 +766,7 @@ rcu_torture_fakewriter(void *arg) cur_ops->exp_sync(); } rcu_stutter_wait("rcu_torture_fakewriter"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); torture_shutdown_absorb("rcu_torture_fakewriter"); @@ -913,7 +911,7 @@ rcu_torture_reader(void *arg) cur_ops->readunlock(idx); schedule(); rcu_stutter_wait("rcu_torture_reader"); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) @@ -1022,9 +1020,6 @@ rcu_torture_stats_print(void) /* * Periodically prints torture statistics, if periodic statistics printing * was specified via the stat_interval module parameter. - * - * No need to worry about fullstop here, since this one doesn't reference - * volatile state or register callbacks. */ static int rcu_torture_stats(void *arg) @@ -1034,7 +1029,7 @@ rcu_torture_stats(void *arg) schedule_timeout_interruptible(stat_interval * HZ); rcu_torture_stats_print(); torture_shutdown_absorb("rcu_torture_stats"); - } while (!kthread_should_stop()); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); return 0; } @@ -1241,16 +1236,15 @@ static int rcu_torture_barrier_cbs(void *arg) wait_event(barrier_cbs_wq[myid], (newphase = ACCESS_ONCE(barrier_phase)) != lastphase || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); + torture_must_stop()); lastphase = newphase; smp_mb(); /* ensure barrier_phase load before ->call(). */ - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + if (torture_must_stop()) break; cur_ops->call(&rcu, rcu_torture_barrier_cbf); if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); torture_shutdown_absorb("rcu_torture_barrier_cbs"); while (!kthread_should_stop()) @@ -1275,9 +1269,8 @@ static int rcu_torture_barrier(void *arg) wake_up(&barrier_cbs_wq[i]); wait_event(barrier_wq, atomic_read(&barrier_cbs_count) == 0 || - kthread_should_stop() || - fullstop != FULLSTOP_DONTSTOP); - if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) + torture_must_stop()); + if (torture_must_stop()) break; n_barrier_attempts++; cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). 
*/ @@ -1287,7 +1280,7 @@ static int rcu_torture_barrier(void *arg) } n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); - } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); torture_shutdown_absorb("rcu_torture_barrier"); while (!kthread_should_stop()) @@ -1585,7 +1578,6 @@ rcu_torture_init(void) else nrealreaders = 2 * num_online_cpus(); rcu_torture_print_module_parms(cur_ops, "Start of test"); - fullstop = FULLSTOP_DONTSTOP; /* Set up the freelist. */ diff --git a/kernel/torture.c b/kernel/torture.c index b02fa2785bbb..ed360cf948da 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -52,8 +52,11 @@ MODULE_AUTHOR("Paul E. McKenney "); static char *torture_type; static bool verbose; -int fullstop = FULLSTOP_RMMOD; -EXPORT_SYMBOL_GPL(fullstop); +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ +static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); #ifdef CONFIG_HOTPLUG_CPU @@ -458,6 +461,7 @@ void __init torture_init_begin(char *ttype, bool v) mutex_lock(&fullstop_mutex); torture_type = ttype; verbose = v; + fullstop = FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_init_begin); @@ -498,3 +502,22 @@ bool torture_cleanup(void) return false; } EXPORT_SYMBOL_GPL(torture_cleanup); + +/* + * Is it time for the current torture test to stop? + */ +bool torture_must_stop(void) +{ + return torture_must_stop_irq() || kthread_should_stop(); +} +EXPORT_SYMBOL_GPL(torture_must_stop); + +/* + * Is it time for the current torture test to stop? This is the irq-safe + * version, hence no check for kthread_should_stop(). + */ +bool torture_must_stop_irq(void) +{ + return fullstop != FULLSTOP_DONTSTOP; +} +EXPORT_SYMBOL_GPL(torture_must_stop_irq); -- cgit v1.3-14-g43fede From fac480efcba6a9f0aea91947f151fd569538b0af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 30 Jan 2014 17:06:30 -0800 Subject: rcutorture: Add diagnostic for unscheduled system shutdown Currently, rcutorture can terminate via rmmod, via self-shutdown, via something else shutting the system down, or of course the usual catastrophic termination. The first two get flagged, so this commit adds a message for the third. For the fourth, your warranty is void as always. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index ed360cf948da..d51de3029a5c 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -438,10 +438,12 @@ static int torture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) + if (fullstop == FULLSTOP_DONTSTOP) { + VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected"); fullstop = FULLSTOP_SHUTDOWN; - else + } else { pr_warn("Concurrent rmmod and shutdown illegal!\n"); + } mutex_unlock(&fullstop_mutex); return NOTIFY_DONE; } -- cgit v1.3-14-g43fede From 628edaa5062282b6e3d76c886fd2cbccae5cb87b Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 31 Jan 2014 11:57:43 -0800 Subject: rcutorture: Abstract stutter_wait() Because stuttering the test load (stopping and restarting it) is useful for non-RCU testing, this commit moves the load-stuttering functionality to kernel/torture.c. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 7 +++- kernel/rcu/rcutorture.c | 69 +++++++-------------------------------- kernel/torture.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 104 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 0259db38bfb0..203f127d9ddf 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -75,8 +75,13 @@ int torture_shuffle_init(long shuffint); /* Shutdown task absorption, for when the tasks cannot safely be killed. */ void torture_shutdown_absorb(const char *title); +/* Task stuttering, which forces load/no-load transitions. */ +void stutter_wait(const char *title); +int torture_stutter_init(int s); +void torture_stutter_cleanup(void); + /* Initialization and cleanup. */ -void torture_init_begin(char *ttype, bool v); +void torture_init_begin(char *ttype, bool v, int *runnable); void torture_init_end(void); bool torture_cleanup(void); bool torture_must_stop(void); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9357c88cc8cc..4329ad14f8dc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -103,7 +103,6 @@ static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; static struct task_struct *stats_task; -static struct task_struct *stutter_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; static struct task_struct *shutdown_task; @@ -145,8 +144,6 @@ static long n_barrier_attempts; static long n_barrier_successes; static struct list_head rcu_torture_removed; -static int stutter_pause_test; - #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 #else @@ -222,18 +219,6 @@ rcu_torture_free(struct rcu_torture *p) spin_unlock_bh(&rcu_torture_lock); } -static void -rcu_stutter_wait(const char *title) -{ - while (stutter_pause_test || !rcutorture_runnable) { - if (rcutorture_runnable) - schedule_timeout_interruptible(1); - else - schedule_timeout_interruptible(round_jiffies_relative(HZ)); - torture_shutdown_absorb(title); - } -} - /* * Operations vector for selecting different types of tests. */ @@ -571,7 +556,7 @@ static int rcu_torture_boost(void *arg) oldstarttime = boost_starttime; while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); - rcu_stutter_wait("rcu_torture_boost"); + stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; } @@ -593,7 +578,7 @@ static int rcu_torture_boost(void *arg) call_rcu_time = jiffies; } cond_resched(); - rcu_stutter_wait("rcu_torture_boost"); + stutter_wait("rcu_torture_boost"); if (torture_must_stop()) goto checkwait; } @@ -618,7 +603,7 @@ static int rcu_torture_boost(void *arg) } /* Go do the stutter. */ -checkwait: rcu_stutter_wait("rcu_torture_boost"); +checkwait: stutter_wait("rcu_torture_boost"); } while (!torture_must_stop()); /* Clean up and exit. 
*/ @@ -656,7 +641,7 @@ rcu_torture_fqs(void *arg) udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; } - rcu_stutter_wait("rcu_torture_fqs"); + stutter_wait("rcu_torture_fqs"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); torture_shutdown_absorb("rcu_torture_fqs"); @@ -728,7 +713,7 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); - rcu_stutter_wait("rcu_torture_writer"); + stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); torture_shutdown_absorb("rcu_torture_writer"); @@ -765,7 +750,7 @@ rcu_torture_fakewriter(void *arg) } else { cur_ops->exp_sync(); } - rcu_stutter_wait("rcu_torture_fakewriter"); + stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); @@ -910,7 +895,7 @@ rcu_torture_reader(void *arg) preempt_enable(); cur_ops->readunlock(idx); schedule(); - rcu_stutter_wait("rcu_torture_reader"); + stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); torture_shutdown_absorb("rcu_torture_reader"); @@ -1034,25 +1019,6 @@ rcu_torture_stats(void *arg) return 0; } -/* Cause the rcutorture test to "stutter", starting and stopping all - * threads periodically. - */ -static int -rcu_torture_stutter(void *arg) -{ - VERBOSE_TOROUT_STRING("rcu_torture_stutter task started"); - do { - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 1; - if (!kthread_should_stop()) - schedule_timeout_interruptible(stutter * HZ); - stutter_pause_test = 0; - torture_shutdown_absorb("rcu_torture_stutter"); - } while (!kthread_should_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_stutter task stopping"); - return 0; -} - static inline void rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { @@ -1403,11 +1369,7 @@ rcu_torture_cleanup(void) rcu_torture_barrier_cleanup(); rcu_torture_stall_cleanup(); - if (stutter_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stutter task"); - kthread_stop(stutter_task); - } - stutter_task = NULL; + torture_stutter_cleanup(); if (writer_task) { VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); @@ -1548,7 +1510,7 @@ rcu_torture_init(void) &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, }; - torture_init_begin(torture_type, verbose); + torture_init_begin(torture_type, verbose, &rcutorture_runnable); /* Process args and tell the world that the torturer is on the job. 
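After conversion, every load-generating kthread shares one shape: apply load, then block in stutter_wait() whenever the test is paused. A sketch of the common loop:

	do {
		/* ... exercise the primitive under test ... */
		stutter_wait("foo_kthread"); /* Blocks while load is switched off. */
	} while (!torture_must_stop());
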
*/ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { @@ -1682,21 +1644,14 @@ rcu_torture_init(void) if (stutter < 0) stutter = 0; if (stutter) { - /* Create the stutter thread */ - stutter_task = kthread_run(rcu_torture_stutter, NULL, - "rcu_torture_stutter"); - if (IS_ERR(stutter_task)) { - firsterr = PTR_ERR(stutter_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; + firsterr = torture_stutter_init(stutter * HZ); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(stutter_task); } if (fqs_duration < 0) fqs_duration = 0; if (fqs_duration) { - /* Create the stutter thread */ + /* Create the fqs thread */ fqs_task = kthread_run(rcu_torture_fqs, NULL, "rcu_torture_fqs"); if (IS_ERR(fqs_task)) { diff --git a/kernel/torture.c b/kernel/torture.c index d51de3029a5c..b30c2ee78580 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -58,6 +58,7 @@ static bool verbose; #define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +static int *torture_runnable; #ifdef CONFIG_HOTPLUG_CPU @@ -452,17 +453,101 @@ static struct notifier_block torture_shutdown_nb = { .notifier_call = torture_shutdown_notify, }; +/* + * Variables for stuttering, which means to periodically pause and + * restart testing in order to catch bugs that appear when load is + * suddenly applied to or removed from the system. + */ +static struct task_struct *stutter_task; +static int stutter_pause_test; +static int stutter; + +/* + * Block until the stutter interval ends. This must be called periodically + * by all running kthreads that need to be subject to stuttering. + */ +void stutter_wait(const char *title) +{ + while (ACCESS_ONCE(stutter_pause_test) || + (torture_runnable && !ACCESS_ONCE(*torture_runnable))) { + if (stutter_pause_test) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + torture_shutdown_absorb(title); + } +} +EXPORT_SYMBOL_GPL(stutter_wait); + +/* + * Cause the torture test to "stutter", starting and stopping all + * threads periodically. + */ +static int torture_stutter(void *arg) +{ + VERBOSE_TOROUT_STRING("torture_stutter task started"); + do { + if (!torture_must_stop()) { + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 1; + } + if (!torture_must_stop()) + schedule_timeout_interruptible(stutter); + ACCESS_ONCE(stutter_pause_test) = 0; + torture_shutdown_absorb("torture_stutter"); + } while (!torture_must_stop()); + VERBOSE_TOROUT_STRING("torture_stutter task stopping"); + return 0; +} + +/* + * Initialize and kick off the torture_stutter kthread. + */ +int torture_stutter_init(int s) +{ + int ret; + + stutter = s; + stutter_task = kthread_run(torture_stutter, NULL, "torture_stutter"); + if (IS_ERR(stutter_task)) { + ret = PTR_ERR(stutter_task); + VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + return ret; + } + torture_shuffle_task_register(stutter_task); + return 0; +} +EXPORT_SYMBOL_GPL(torture_stutter_init); + +/* + * Cleanup after the torture_stutter kthread. + */ +void torture_stutter_cleanup(void) +{ + if (!stutter_task) + return; + VERBOSE_TOROUT_STRING("Stopping torture_stutter task"); + kthread_stop(stutter_task); + stutter_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_stutter_cleanup); + /* * Initialize torture module. Please note that this is -not- invoked via * the usual module_init() mechanism, but rather by an explicit call from * the client torture module. 
This call must be paired with a later * torture_init_end(). + * + * The runnable parameter points to a flag that controls whether or not + * the test is currently runnable. If there is no such flag, pass in NULL. */ -void __init torture_init_begin(char *ttype, bool v) +void __init torture_init_begin(char *ttype, bool v, int *runnable) { mutex_lock(&fullstop_mutex); torture_type = ttype; verbose = v; + torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; } -- cgit v1.3-14-g43fede From 57a2fe90fcdaa812ac1aa6c91ba0e591c30f461a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 12:58:39 -0800 Subject: rcutorture: Apply ACCESS_ONCE() to racy fullstop accesses Because the fullstop variable can be accessed while it is being updated, this commit avoids any resulting compiler mischief through use of ACCESS_ONCE() for non-initialization accesses to this shared variable. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index b30c2ee78580..1bafd02d1eed 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -439,9 +439,9 @@ static int torture_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_DONTSTOP) { + if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) { VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected"); - fullstop = FULLSTOP_SHUTDOWN; + ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN; } else { pr_warn("Concurrent rmmod and shutdown illegal!\n"); } @@ -575,13 +575,13 @@ EXPORT_SYMBOL_GPL(torture_init_end); bool torture_cleanup(void) { mutex_lock(&fullstop_mutex); - if (fullstop == FULLSTOP_SHUTDOWN) { + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_warn("Concurrent rmmod and shutdown illegal!\n"); mutex_unlock(&fullstop_mutex); schedule_timeout_uninterruptible(10); return true; } - fullstop = FULLSTOP_RMMOD; + ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&torture_shutdown_nb); torture_shuffle_cleanup(); @@ -605,6 +605,6 @@ EXPORT_SYMBOL_GPL(torture_must_stop); */ bool torture_must_stop_irq(void) { - return fullstop != FULLSTOP_DONTSTOP; + return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_must_stop_irq); -- cgit v1.3-14-g43fede From e991dbc0770b01b7dc7d6d7660442e83ebd11828 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 14:52:13 -0800 Subject: rcutorture: Abstract torture_shutdown() Because auto-shutdown of torture testing is not specific to RCU, this commit moves the auto-shutdown function to kernel/torture.c. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 4 ++- kernel/rcu/rcutorture.c | 63 +++---------------------------------- kernel/torture.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 203f127d9ddf..c79c41d543ef 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -72,8 +72,10 @@ unsigned long torture_random(struct torture_random_state *trsp); void torture_shuffle_task_register(struct task_struct *tp); int torture_shuffle_init(long shuffint); -/* Shutdown task absorption, for when the tasks cannot safely be killed. */ +/* Test auto-shutdown handling. 
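The ACCESS_ONCE() conversions in the patch just above follow the usual rule for intentionally racy variables: every non-initialization load and store of fullstop is wrapped so the compiler can neither refetch, tear, nor fuse the access. ACCESS_ONCE() is simply a volatile cast (as defined in include/linux/compiler.h of this era):

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	/* So, for example: */
	if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP)		/* one real load */
		ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;	/* one real store */
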
*/ void torture_shutdown_absorb(const char *title); +int torture_shutdown_init(int ssecs, void (*cleanup)(void)); +void torture_shutdown_cleanup(void); /* Task stuttering, which forces load/no-load transitions. */ void stutter_wait(const char *title); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4329ad14f8dc..897b0f91f899 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -105,7 +105,6 @@ static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; -static struct task_struct *shutdown_task; static struct task_struct *stall_task; static struct task_struct **barrier_cbs_tasks; static struct task_struct *barrier_task; @@ -173,7 +172,6 @@ static u64 notrace rcu_trace_clock_local(void) } #endif /* #else #ifdef CONFIG_RCU_TRACE */ -static unsigned long shutdown_time; /* jiffies to system shutdown. */ static unsigned long boost_starttime; /* jiffies of next boost test start. */ DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ /* and boost task create/destroy. */ @@ -183,9 +181,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); -/* Forward reference. */ -static void rcu_torture_cleanup(void); - /* * Allocate an element from the rcu_tortures pool. */ @@ -1086,42 +1081,6 @@ static int rcutorture_booster_init(int cpu) return 0; } -/* - * Cause the rcutorture test to shutdown the system after the test has - * run for the time specified by the shutdown_secs module parameter. - */ -static int -rcu_torture_shutdown(void *arg) -{ - long delta; - unsigned long jiffies_snap; - - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task started"); - jiffies_snap = ACCESS_ONCE(jiffies); - while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && - !kthread_should_stop()) { - delta = shutdown_time - jiffies_snap; - if (verbose) - pr_alert("%s" TORTURE_FLAG - "rcu_torture_shutdown task: %lu jiffies remaining\n", - torture_type, delta); - schedule_timeout_interruptible(delta); - jiffies_snap = ACCESS_ONCE(jiffies); - } - if (kthread_should_stop()) { - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task stopping"); - return 0; - } - - /* OK, shut down the system. */ - - VERBOSE_TOROUT_STRING("rcu_torture_shutdown task shutting down system"); - shutdown_task = NULL; /* Avoid self-kill deadlock. */ - rcu_torture_cleanup(); /* Get the success/failure message. */ - kernel_power_off(); /* Shut down the system. */ - return 0; -} - /* * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then * induces a CPU stall for the time specified by stall_cpu. @@ -1421,11 +1380,7 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - if (shutdown_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; + torture_shutdown_cleanup(); /* Wait for all RCU callbacks to fire. 
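On the init side (next hunk), the client hands its own cleanup function to torture_shutdown_init() as the hook to run before power-off, so the final success/failure verdict still gets printed. Schematically, for a hypothetical client:

	/* Shut the system down after shutdown_secs seconds of testing. */
	firsterr = torture_shutdown_init(shutdown_secs, foo_torture_cleanup);
	if (firsterr)
		goto unwind;

A zero or negative shutdown_secs leaves the shutdown task unspawned, making the call a no-op.
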
*/ @@ -1681,18 +1636,10 @@ rcu_torture_init(void) } } } - if (shutdown_secs > 0) { - shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(rcu_torture_shutdown, NULL, - "rcu_torture_shutdown"); - if (IS_ERR(shutdown_task)) { - firsterr = PTR_ERR(shutdown_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - goto unwind; - } - torture_shuffle_task_register(shutdown_task); - wake_up_process(shutdown_task); + i = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); + if (i != 0) { + firsterr = i; + goto unwind; } i = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); if (i != 0) { diff --git a/kernel/torture.c b/kernel/torture.c index 1bafd02d1eed..df2c700e96e4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -418,6 +418,15 @@ static void torture_shuffle_cleanup(void) } EXPORT_SYMBOL_GPL(torture_shuffle_cleanup); +/* + * Variables for auto-shutdown. This allows "lights out" torture runs + * to be fully scripted. + */ +static int shutdown_secs; /* desired test duration in seconds. */ +static struct task_struct *shutdown_task; +static unsigned long shutdown_time; /* jiffies to system shutdown. */ +static void (*torture_shutdown_hook)(void); + /* * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. @@ -432,6 +441,81 @@ void torture_shutdown_absorb(const char *title) } EXPORT_SYMBOL_GPL(torture_shutdown_absorb); +/* + * Cause the torture test to shutdown the system after the test has + * run for the time specified by the shutdown_secs parameter. + */ +static int torture_shutdown(void *arg) +{ + long delta; + unsigned long jiffies_snap; + + VERBOSE_TOROUT_STRING("torture_shutdown task started"); + jiffies_snap = jiffies; + while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && + !torture_must_stop()) { + delta = shutdown_time - jiffies_snap; + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_shutdown task: %lu jiffies remaining\n", + torture_type, delta); + schedule_timeout_interruptible(delta); + jiffies_snap = jiffies; + } + if (torture_must_stop()) { + VERBOSE_TOROUT_STRING("torture_shutdown task stopping"); + return 0; + } + + /* OK, shut down the system. */ + + VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system"); + shutdown_task = NULL; /* Avoid self-kill deadlock. */ + torture_shutdown_hook();/* Shut down the enclosing torture test. */ + kernel_power_off(); /* Shut down the system. */ + return 0; +} + +/* + * Start up the shutdown task. + */ +int torture_shutdown_init(int ssecs, void (*cleanup)(void)) +{ + int ret; + + shutdown_secs = ssecs; + torture_shutdown_hook = cleanup; + if (shutdown_secs > 0) { + shutdown_time = jiffies + shutdown_secs * HZ; + shutdown_task = kthread_create(torture_shutdown, NULL, + "torture_shutdown"); + if (IS_ERR(shutdown_task)) { + ret = PTR_ERR(shutdown_task); + VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); + shutdown_task = NULL; + return ret; + } + torture_shuffle_task_register(shutdown_task); + wake_up_process(shutdown_task); + } + return 0; +} +EXPORT_SYMBOL_GPL(torture_shutdown_init); + +/* + * Shut down the shutdown task. Say what??? Heh! This can happen if + * the torture module gets an rmmod before the shutdown time arrives. 
;-) + */ +void torture_shutdown_cleanup(void) +{ + if (shutdown_task != NULL) { + VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; +} +EXPORT_SYMBOL_GPL(torture_shutdown_cleanup); + /* * Detect and respond to a system shutdown. */ -- cgit v1.3-14-g43fede From 01025ebc99e39ac962c32e063cad9a3012ee8b0a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 15:15:02 -0800 Subject: rcutorture: Clean up rcu_torture_init() error checking This commit applies some simple cleanups to rcu_torture_init() error checking. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 897b0f91f899..746c4278ea5e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1460,7 +1460,6 @@ rcu_torture_init(void) int i; int cpu; int firsterr = 0; - int retval; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, }; @@ -1629,33 +1628,23 @@ rcu_torture_init(void) for_each_possible_cpu(i) { if (cpu_is_offline(i)) continue; /* Heuristic: CPU can go offline. */ - retval = rcutorture_booster_init(i); - if (retval < 0) { - firsterr = retval; + firsterr = rcutorture_booster_init(i); + if (firsterr) goto unwind; - } } } - i = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); - if (i != 0) { - firsterr = i; + firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup); + if (firsterr) goto unwind; - } - i = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); - if (i != 0) { - firsterr = i; + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ); + if (firsterr) goto unwind; - } - i = rcu_torture_stall_init(); - if (i != 0) { - firsterr = i; + firsterr = rcu_torture_stall_init(); + if (firsterr) goto unwind; - } - retval = rcu_torture_barrier_init(); - if (retval != 0) { - firsterr = retval; + firsterr = rcu_torture_barrier_init(); + if (firsterr) goto unwind; - } if (object_debug) rcu_test_debug_objects(); rcutorture_record_test_transition(); -- cgit v1.3-14-g43fede From 14562d1cf12b434da2c69b5603a4149ac43f3b48 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 15:39:52 -0800 Subject: rcutorture: Announce task creation A few "stealth-start rcutorture kthreads" have accumulated over the years, so this commit adds console-log announcements (but only if the torture tests are running verbose). Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 4 ++++ kernel/torture.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 746c4278ea5e..6e9ba51b23b9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1120,6 +1120,7 @@ static int __init rcu_torture_stall_init(void) if (stall_cpu <= 0) return 0; + VERBOSE_TOROUT_STRING("Creating rcu_torture_stall task"); stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); if (IS_ERR(stall_task)) { ret = PTR_ERR(stall_task); @@ -1242,6 +1243,7 @@ static int rcu_torture_barrier_init(void) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); + VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier_cbs task"); barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, (void *)(long)i, "rcu_torture_barrier_cbs"); @@ -1253,6 +1255,7 @@ static int rcu_torture_barrier_init(void) } torture_shuffle_task_register(barrier_cbs_tasks[i]); } + VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier task"); barrier_task = kthread_run(rcu_torture_barrier, NULL, "rcu_torture_barrier"); if (IS_ERR(barrier_task)) { @@ -1606,6 +1609,7 @@ rcu_torture_init(void) fqs_duration = 0; if (fqs_duration) { /* Create the fqs thread */ + VERBOSE_TOROUT_STRING("Creating rcu_torture_fqs task"); fqs_task = kthread_run(rcu_torture_fqs, NULL, "rcu_torture_fqs"); if (IS_ERR(fqs_task)) { diff --git a/kernel/torture.c b/kernel/torture.c index df2c700e96e4..5e2838f902f9 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -187,6 +187,7 @@ int torture_onoff_init(long ooholdoff, long oointerval) onoff_interval = oointerval; if (onoff_interval <= 0) return 0; + VERBOSE_TOROUT_STRING("Creating torture_onoff task"); onoff_task = kthread_run(torture_onoff, NULL, "torture_onoff"); if (IS_ERR(onoff_task)) { ret = PTR_ERR(onoff_task); @@ -390,6 +391,7 @@ int torture_shuffle_init(long shuffint) } /* Create the shuffler thread */ + VERBOSE_TOROUT_STRING("Creating torture_shuffle task"); shuffler_task = kthread_run(torture_shuffle, NULL, "torture_shuffle"); if (IS_ERR(shuffler_task)) { ret = PTR_ERR(shuffler_task); @@ -486,6 +488,7 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) shutdown_secs = ssecs; torture_shutdown_hook = cleanup; if (shutdown_secs > 0) { + VERBOSE_TOROUT_STRING("Creating torture_shutdown task"); shutdown_time = jiffies + shutdown_secs * HZ; shutdown_task = kthread_create(torture_shutdown, NULL, "torture_shutdown"); @@ -592,6 +595,7 @@ int torture_stutter_init(int s) int ret; stutter = s; + VERBOSE_TOROUT_STRING("Creating torture_stutter task"); stutter_task = kthread_run(torture_stutter, NULL, "torture_stutter"); if (IS_ERR(stutter_task)) { ret = PTR_ERR(stutter_task); -- cgit v1.3-14-g43fede From 7fafaac5b9ce22cc57777865390520476ad2262d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 31 Jan 2014 17:37:28 -0800 Subject: rcutorture: Fix rcutorture shutdown races Not all of the rcutorture kthreads waited for kthread_should_stop() before returning from their top-level functions, and none of them used torture_shutdown_absorb() properly. These problems can result in segfaults and hangs at shutdown time, and some recent changes perturbed timing sufficiently to make them much more probable. This commit therefore creates a torture_kthread_stopping() function that does the proper kthread shutdown dance in one centralized location. 
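To make the centralized dance concrete, here is a rough sketch of the shape a torture kthread's top-level function takes after this change (illustration only, not from the patch: the "example" kthread name and the loop body are hypothetical, and the real helper is added in the kernel/torture.c hunk below):

static int example_torture_kthread(void *arg)
{
	VERBOSE_TOROUT_STRING("example task started");
	do {
		/* One iteration of torture work goes here. */
		stutter_wait("example");
	} while (!torture_must_stop());
	/* Announce, absorb any shutdown, and wait for kthread_should_stop(). */
	torture_kthread_stopping("example");
	return 0;
}
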
Accommodate this grouping by making VERBOSE_TOROUT_STRING() capable of taking a non-const string as its argument, which allows the new torture_kthread_stopping() to pass its "title" argument directly to the updated version of VERBOSE_TOROUT_STRING(). Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 3 ++- kernel/rcu/rcutorture.c | 39 +++++++++++---------------------------- kernel/torture.c | 26 ++++++++++++++++++++++---- 3 files changed, 35 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index c79c41d543ef..2ea11094daf6 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -45,7 +45,7 @@ #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG s "\n", torture_type) #define VERBOSE_TOROUT_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) @@ -88,5 +88,6 @@ void torture_init_end(void); bool torture_cleanup(void); bool torture_must_stop(void); bool torture_must_stop_irq(void); +void torture_kthread_stopping(char *title); #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6e9ba51b23b9..bcaafd6cf633 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -602,12 +602,13 @@ checkwait: stutter_wait("rcu_torture_boost"); } while (!torture_must_stop()); /* Clean up and exit. */ - VERBOSE_TOROUT_STRING("rcu_torture_boost task stopping"); - torture_shutdown_absorb("rcu_torture_boost"); - while (!kthread_should_stop() || rbi.inflight) + while (!kthread_should_stop() || rbi.inflight) { + torture_shutdown_absorb("rcu_torture_boost"); schedule_timeout_uninterruptible(1); + } smp_mb(); /* order accesses to ->inflight before stack-frame death. 
*/ destroy_rcu_head_on_stack(&rbi.rcu); + torture_kthread_stopping("rcu_torture_boost"); return 0; } @@ -638,10 +639,7 @@ rcu_torture_fqs(void *arg) } stutter_wait("rcu_torture_fqs"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_fqs task stopping"); - torture_shutdown_absorb("rcu_torture_fqs"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_fqs"); return 0; } @@ -710,10 +708,7 @@ rcu_torture_writer(void *arg) rcutorture_record_progress(++rcu_torture_current_version); stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_writer task stopping"); - torture_shutdown_absorb("rcu_torture_writer"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_writer"); return 0; } @@ -748,10 +743,7 @@ rcu_torture_fakewriter(void *arg) stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task stopping"); - torture_shutdown_absorb("rcu_torture_fakewriter"); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_fakewriter"); return 0; } @@ -892,12 +884,9 @@ rcu_torture_reader(void *arg) schedule(); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_reader task stopping"); - torture_shutdown_absorb("rcu_torture_reader"); if (irqreader && cur_ops->irq_capable) del_timer_sync(&t); - while (!kthread_should_stop()) - schedule_timeout_uninterruptible(1); + torture_kthread_stopping("rcu_torture_reader"); return 0; } @@ -1010,7 +999,7 @@ rcu_torture_stats(void *arg) rcu_torture_stats_print(); torture_shutdown_absorb("rcu_torture_stats"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_stats task stopping"); + torture_kthread_stopping("rcu_torture_stats"); return 0; } @@ -1171,12 +1160,9 @@ static int rcu_torture_barrier_cbs(void *arg) if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task stopping"); - torture_shutdown_absorb("rcu_torture_barrier_cbs"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); cur_ops->cb_barrier(); destroy_rcu_head_on_stack(&rcu); + torture_kthread_stopping("rcu_torture_barrier_cbs"); return 0; } @@ -1207,10 +1193,7 @@ static int rcu_torture_barrier(void *arg) n_barrier_successes++; schedule_timeout_interruptible(HZ / 10); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("rcu_torture_barrier task stopping"); - torture_shutdown_absorb("rcu_torture_barrier"); - while (!kthread_should_stop()) - schedule_timeout_interruptible(1); + torture_kthread_stopping("rcu_torture_barrier"); return 0; } diff --git a/kernel/torture.c b/kernel/torture.c index 5e2838f902f9..330576660cf4 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -169,7 +169,7 @@ torture_onoff(void *arg) } schedule_timeout_interruptible(onoff_interval); } - VERBOSE_TOROUT_STRING("torture_onoff task stopping"); + torture_kthread_stopping("torture_onoff"); return 0; } @@ -370,7 +370,7 @@ static int torture_shuffle(void *arg) torture_shuffle_tasks(); torture_shutdown_absorb("torture_shuffle"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("torture_shuffle task stopping"); + torture_kthread_stopping("torture_shuffle"); return 0; } @@ -465,7 +465,7 @@ static int torture_shutdown(void *arg) jiffies_snap = 
jiffies; } if (torture_must_stop()) { - VERBOSE_TOROUT_STRING("torture_shutdown task stopping"); + torture_kthread_stopping("torture_shutdown"); return 0; } @@ -583,7 +583,7 @@ static int torture_stutter(void *arg) ACCESS_ONCE(stutter_pause_test) = 0; torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); - VERBOSE_TOROUT_STRING("torture_stutter task stopping"); + torture_kthread_stopping("torture_stutter"); return 0; } @@ -696,3 +696,21 @@ bool torture_must_stop_irq(void) return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP; } EXPORT_SYMBOL_GPL(torture_must_stop_irq); + +/* + * Each kthread must wait for kthread_should_stop() before returning from + * its top-level function, otherwise segfaults ensue. This function + * prints a "stopping" message and waits for kthread_should_stop(), and + * should be called from all torture kthreads immediately prior to + * returning. + */ +void torture_kthread_stopping(char *title) +{ + if (verbose) + VERBOSE_TOROUT_STRING(title); + while (!kthread_should_stop()) { + torture_shutdown_absorb(title); + schedule_timeout_uninterruptible(1); + } +} +EXPORT_SYMBOL_GPL(torture_kthread_stopping); -- cgit v1.3-14-g43fede From bc8f83e2c0d585b201dfbb52e98f6f8741d324ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Feb 2014 10:02:41 -0800 Subject: rcutorture: Fix missing-return bug in rcu_torture_barrier_init() This commit adds a missing error return to the code path that creates the rcu_torture_barrier() kthread. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bcaafd6cf633..25e9b16fe7f0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1245,6 +1245,7 @@ static int rcu_torture_barrier_init(void) ret = PTR_ERR(barrier_task); VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); barrier_task = NULL; + return ret; } torture_shuffle_task_register(barrier_task); return 0; -- cgit v1.3-14-g43fede From 47cf29b9e721967aac95ebda9e50408219755852 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Feb 2014 11:52:27 -0800 Subject: rcutorture: Abstract torture_create_kthread() Creation of kthreads is not RCU-specific, so this commit abstracts out torture_create_kthread(), saving a few tens of lines of code in the process. This change requires modifying VERBOSE_TOROUT_ERRSTRING() to take a non-const string, so that _torture_create_kthread() can avoid an open-coded substitute. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 8 +++- kernel/rcu/rcutorture.c | 98 ++++++++++--------------------------------------- kernel/torture.c | 80 +++++++++++++++++----------------------- 3 files changed, 60 insertions(+), 126 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 2ea11094daf6..430cc3008628 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -47,7 +47,7 @@ #define VERBOSE_TOROUT_STRING(s) \ do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) /* Definitions for a non-string torture-test module parameter. 
*/ #define torture_parm(type, name, init, msg) \ @@ -89,5 +89,11 @@ bool torture_cleanup(void); bool torture_must_stop(void); bool torture_must_stop_irq(void); void torture_kthread_stopping(char *title); +int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, + char *f, struct task_struct **tp); + +#define torture_create_kthread(n, arg, tp) \ + _torture_create_kthread(n, (arg), #n, "Creating " #n " task", \ + "Failed to create " #n, &(tp)) #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 25e9b16fe7f0..a6f6c8418d87 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1105,19 +1105,9 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - int ret; - if (stall_cpu <= 0) return 0; - VERBOSE_TOROUT_STRING("Creating rcu_torture_stall task"); - stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); - if (IS_ERR(stall_task)) { - ret = PTR_ERR(stall_task); - stall_task = NULL; - return ret; - } - torture_shuffle_task_register(stall_task); - return 0; + return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } /* Clean up after the CPU-stall kthread, if one was spawned. */ @@ -1226,29 +1216,13 @@ static int rcu_torture_barrier_init(void) return -ENOMEM; for (i = 0; i < n_barrier_cbs; i++) { init_waitqueue_head(&barrier_cbs_wq[i]); - VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier_cbs task"); - barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, - (void *)(long)i, - "rcu_torture_barrier_cbs"); - if (IS_ERR(barrier_cbs_tasks[i])) { - ret = PTR_ERR(barrier_cbs_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); - barrier_cbs_tasks[i] = NULL; + ret = torture_create_kthread(rcu_torture_barrier_cbs, + (void *)(long)i, + barrier_cbs_tasks[i]); + if (ret) return ret; - } - torture_shuffle_task_register(barrier_cbs_tasks[i]); } - VERBOSE_TOROUT_STRING("Creating rcu_torture_barrier task"); - barrier_task = kthread_run(rcu_torture_barrier, NULL, - "rcu_torture_barrier"); - if (IS_ERR(barrier_task)) { - ret = PTR_ERR(barrier_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create rcu_torture_barrier"); - barrier_task = NULL; - return ret; - } - torture_shuffle_task_register(barrier_task); - return 0; + return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task); } /* Clean up after RCU barrier testing. */ @@ -1516,17 +1490,10 @@ rcu_torture_init(void) /* Start up the kthreads. 
*/ - VERBOSE_TOROUT_STRING("Creating rcu_torture_writer task"); - writer_task = kthread_create(rcu_torture_writer, NULL, - "rcu_torture_writer"); - if (IS_ERR(writer_task)) { - firsterr = PTR_ERR(writer_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create writer"); - writer_task = NULL; + firsterr = torture_create_kthread(rcu_torture_writer, NULL, + writer_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(writer_task); - wake_up_process(writer_task); fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), GFP_KERNEL); if (fakewriter_tasks == NULL) { @@ -1535,16 +1502,10 @@ rcu_torture_init(void) goto unwind; } for (i = 0; i < nfakewriters; i++) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_fakewriter task"); - fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, - "rcu_torture_fakewriter"); - if (IS_ERR(fakewriter_tasks[i])) { - firsterr = PTR_ERR(fakewriter_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create fakewriter"); - fakewriter_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_fakewriter, + NULL, fakewriter_tasks[i]); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(fakewriter_tasks[i]); } reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), GFP_KERNEL); @@ -1554,28 +1515,16 @@ rcu_torture_init(void) goto unwind; } for (i = 0; i < nrealreaders; i++) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_reader task"); - reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, - "rcu_torture_reader"); - if (IS_ERR(reader_tasks[i])) { - firsterr = PTR_ERR(reader_tasks[i]); - VERBOSE_TOROUT_ERRSTRING("Failed to create reader"); - reader_tasks[i] = NULL; + firsterr = torture_create_kthread(rcu_torture_reader, NULL, + reader_tasks[i]); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(reader_tasks[i]); } if (stat_interval > 0) { - VERBOSE_TOROUT_STRING("Creating rcu_torture_stats task"); - stats_task = kthread_run(rcu_torture_stats, NULL, - "rcu_torture_stats"); - if (IS_ERR(stats_task)) { - firsterr = PTR_ERR(stats_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stats"); - stats_task = NULL; + firsterr = torture_create_kthread(rcu_torture_stats, NULL, + stats_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(stats_task); } if (test_no_idle_hz) { firsterr = torture_shuffle_init(shuffle_interval * HZ); @@ -1593,16 +1542,9 @@ rcu_torture_init(void) fqs_duration = 0; if (fqs_duration) { /* Create the fqs thread */ - VERBOSE_TOROUT_STRING("Creating rcu_torture_fqs task"); - fqs_task = kthread_run(rcu_torture_fqs, NULL, - "rcu_torture_fqs"); - if (IS_ERR(fqs_task)) { - firsterr = PTR_ERR(fqs_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create fqs"); - fqs_task = NULL; + torture_create_kthread(rcu_torture_fqs, NULL, fqs_task); + if (firsterr) goto unwind; - } - torture_shuffle_task_register(fqs_task); } if (test_boost_interval < 1) test_boost_interval = 1; diff --git a/kernel/torture.c b/kernel/torture.c index 330576660cf4..439451821a7f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -180,23 +180,16 @@ torture_onoff(void *arg) */ int torture_onoff_init(long ooholdoff, long oointerval) { -#ifdef CONFIG_HOTPLUG_CPU - int ret; + int ret = 0; +#ifdef CONFIG_HOTPLUG_CPU onoff_holdoff = ooholdoff; onoff_interval = oointerval; if (onoff_interval <= 0) return 0; - VERBOSE_TOROUT_STRING("Creating torture_onoff task"); - onoff_task = kthread_run(torture_onoff, NULL, "torture_onoff"); - if (IS_ERR(onoff_task)) { - ret = PTR_ERR(onoff_task); - onoff_task = NULL; - return 
ret; - } - torture_shuffle_task_register(onoff_task); + ret = torture_create_kthread(torture_onoff, NULL, onoff_task); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ - return 0; + return ret; } EXPORT_SYMBOL_GPL(torture_onoff_init); @@ -379,8 +372,6 @@ static int torture_shuffle(void *arg) */ int torture_shuffle_init(long shuffint) { - int ret; - shuffle_interval = shuffint; shuffle_idle_cpu = -1; @@ -391,17 +382,7 @@ int torture_shuffle_init(long shuffint) } /* Create the shuffler thread */ - VERBOSE_TOROUT_STRING("Creating torture_shuffle task"); - shuffler_task = kthread_run(torture_shuffle, NULL, "torture_shuffle"); - if (IS_ERR(shuffler_task)) { - ret = PTR_ERR(shuffler_task); - free_cpumask_var(shuffle_tmp_mask); - VERBOSE_TOROUT_ERRSTRING("Failed to create shuffler"); - shuffler_task = NULL; - return ret; - } - torture_shuffle_task_register(shuffler_task); - return 0; + return torture_create_kthread(torture_shuffle, NULL, shuffler_task); } EXPORT_SYMBOL_GPL(torture_shuffle_init); @@ -483,25 +464,16 @@ static int torture_shutdown(void *arg) */ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) { - int ret; + int ret = 0; shutdown_secs = ssecs; torture_shutdown_hook = cleanup; if (shutdown_secs > 0) { - VERBOSE_TOROUT_STRING("Creating torture_shutdown task"); shutdown_time = jiffies + shutdown_secs * HZ; - shutdown_task = kthread_create(torture_shutdown, NULL, - "torture_shutdown"); - if (IS_ERR(shutdown_task)) { - ret = PTR_ERR(shutdown_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create shutdown"); - shutdown_task = NULL; - return ret; - } - torture_shuffle_task_register(shutdown_task); - wake_up_process(shutdown_task); + ret = torture_create_kthread(torture_shutdown, NULL, + shutdown_task); } - return 0; + return ret; } EXPORT_SYMBOL_GPL(torture_shutdown_init); @@ -595,16 +567,8 @@ int torture_stutter_init(int s) int ret; stutter = s; - VERBOSE_TOROUT_STRING("Creating torture_stutter task"); - stutter_task = kthread_run(torture_stutter, NULL, "torture_stutter"); - if (IS_ERR(stutter_task)) { - ret = PTR_ERR(stutter_task); - VERBOSE_TOROUT_ERRSTRING("Failed to create stutter"); - stutter_task = NULL; - return ret; - } - torture_shuffle_task_register(stutter_task); - return 0; + ret = torture_create_kthread(torture_stutter, NULL, stutter_task); + return ret; } EXPORT_SYMBOL_GPL(torture_stutter_init); @@ -714,3 +678,25 @@ void torture_kthread_stopping(char *title) } } EXPORT_SYMBOL_GPL(torture_kthread_stopping); + +/* + * Create a generic torture kthread that is immediately runnable. If you + * need the kthread to be stopped so that you can do something to it before + * it starts, you will need to open-code your own. + */ +int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, + char *f, struct task_struct **tp) +{ + int ret = 0; + + VERBOSE_TOROUT_STRING(m); + *tp = kthread_run(fn, arg, s); + if (IS_ERR(*tp)) { + ret = PTR_ERR(*tp); + VERBOSE_TOROUT_ERRSTRING(f); + *tp = NULL; + } + torture_shuffle_task_register(*tp); + return ret; +} +EXPORT_SYMBOL_GPL(_torture_create_kthread); -- cgit v1.3-14-g43fede From 9c029b86098decd4660eec511b8d2d42da3e7dd9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2014 11:47:08 -0800 Subject: rcutorture: Abstract torture_stop_kthread() Stopping of kthreads is not RCU-specific, so this commit abstracts out torture_stop_kthread(), saving a few lines of code in the process. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 3 +++ kernel/rcu/rcutorture.c | 72 ++++++++++--------------------------------------- kernel/torture.c | 13 +++++++++ 3 files changed, 30 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 430cc3008628..7ccfb0a16728 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -91,9 +91,12 @@ bool torture_must_stop_irq(void); void torture_kthread_stopping(char *title); int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, char *f, struct task_struct **tp); +void _torture_stop_kthread(char *m, struct task_struct **tp); #define torture_create_kthread(n, arg, tp) \ _torture_create_kthread(n, (arg), #n, "Creating " #n " task", \ "Failed to create " #n, &(tp)) +#define torture_stop_kthread(n, tp) \ + _torture_stop_kthread("Stopping " #n " task", &(tp)) #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index a6f6c8418d87..37bd4beea198 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1033,14 +1033,12 @@ static void rcutorture_booster_cleanup(int cpu) if (boost_tasks[cpu] == NULL) return; mutex_lock(&boost_mutex); - VERBOSE_TOROUT_STRING("Stopping rcu_torture_boost task"); t = boost_tasks[cpu]; boost_tasks[cpu] = NULL; mutex_unlock(&boost_mutex); /* This must be outside of the mutex, otherwise deadlock! */ - kthread_stop(t); - boost_tasks[cpu] = NULL; + torture_stop_kthread(rcu_torture_boost, t); } static int rcutorture_booster_init(int cpu) @@ -1110,16 +1108,6 @@ static int __init rcu_torture_stall_init(void) return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } -/* Clean up after the CPU-stall kthread, if one was spawned. */ -static void rcu_torture_stall_cleanup(void) -{ - if (stall_task == NULL) - return; - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stall_task."); - kthread_stop(stall_task); - stall_task = NULL; -} - /* Callback function for RCU barrier testing. 
*/ void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -1230,19 +1218,11 @@ static void rcu_torture_barrier_cleanup(void) { int i; - if (barrier_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier task"); - kthread_stop(barrier_task); - barrier_task = NULL; - } + torture_stop_kthread(rcu_torture_barrier, barrier_task); if (barrier_cbs_tasks != NULL) { - for (i = 0; i < n_barrier_cbs; i++) { - if (barrier_cbs_tasks[i] != NULL) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_barrier_cbs task"); - kthread_stop(barrier_cbs_tasks[i]); - barrier_cbs_tasks[i] = NULL; - } - } + for (i = 0; i < n_barrier_cbs; i++) + torture_stop_kthread(rcu_torture_barrier_cbs, + barrier_cbs_tasks[i]); kfree(barrier_cbs_tasks); barrier_cbs_tasks = NULL; } @@ -1288,53 +1268,29 @@ rcu_torture_cleanup(void) } rcu_torture_barrier_cleanup(); - rcu_torture_stall_cleanup(); + torture_stop_kthread(rcu_torture_stall, stall_task); torture_stutter_cleanup(); - - if (writer_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_writer task"); - kthread_stop(writer_task); - } - writer_task = NULL; + torture_stop_kthread(rcu_torture_writer, writer_task); if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) { - if (reader_tasks[i]) { - VERBOSE_TOROUT_STRING( - "Stopping rcu_torture_reader task"); - kthread_stop(reader_tasks[i]); - } - reader_tasks[i] = NULL; - } + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_torture_reader, + reader_tasks[i]); kfree(reader_tasks); - reader_tasks = NULL; } rcu_torture_current = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) { - if (fakewriter_tasks[i]) { - VERBOSE_TOROUT_STRING( - "Stopping rcu_torture_fakewriter task"); - kthread_stop(fakewriter_tasks[i]); - } - fakewriter_tasks[i] = NULL; + torture_stop_kthread(rcu_torture_fakewriter, + fakewriter_tasks[i]); } kfree(fakewriter_tasks); fakewriter_tasks = NULL; } - if (stats_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_stats task"); - kthread_stop(stats_task); - } - stats_task = NULL; - - if (fqs_task) { - VERBOSE_TOROUT_STRING("Stopping rcu_torture_fqs task"); - kthread_stop(fqs_task); - } - fqs_task = NULL; + torture_stop_kthread(rcu_torture_stats, stats_task); + torture_stop_kthread(rcu_torture_fqs, fqs_task); if ((test_boost == 1 && cur_ops->can_boost) || test_boost == 2) { unregister_cpu_notifier(&rcutorture_cpu_nb); diff --git a/kernel/torture.c b/kernel/torture.c index 439451821a7f..871f63611f7f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -700,3 +700,16 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m, return ret; } EXPORT_SYMBOL_GPL(_torture_create_kthread); + +/* + * Stop a generic kthread, emitting a message. + */ +void _torture_stop_kthread(char *m, struct task_struct **tp) +{ + if (*tp == NULL) + return; + VERBOSE_TOROUT_STRING(m); + kthread_stop(*tp); + *tp = NULL; +} +EXPORT_SYMBOL_GPL(_torture_stop_kthread); -- cgit v1.3-14-g43fede From bfefc73aa1d1bad317bccef8a15da39263d3d962 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2014 12:35:27 -0800 Subject: rcutorture: Stop generic kthreads in torture_cleanup() The specific torture modules (like rcutorture) need to call torture_cleanup() in any case, so this commit makes torture_cleanup() deal with torture_shutdown_cleanup() and torture_stutter_cleanup() so that the specific modules don't have to deal with these details. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- include/linux/torture.h | 2 -- kernel/rcu/rcutorture.c | 7 ------- kernel/torture.c | 37 +++++++++++++++++++------------------ 3 files changed, 19 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 7ccfb0a16728..b2e2b468e511 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -75,12 +75,10 @@ int torture_shuffle_init(long shuffint); /* Test auto-shutdown handling. */ void torture_shutdown_absorb(const char *title); int torture_shutdown_init(int ssecs, void (*cleanup)(void)); -void torture_shutdown_cleanup(void); /* Task stuttering, which forces load/no-load transitions. */ void stutter_wait(const char *title); int torture_stutter_init(int s); -void torture_stutter_cleanup(void); /* Initialization and cleanup. */ void torture_init_begin(char *ttype, bool v, int *runnable); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 37bd4beea198..40792e76a116 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -53,11 +53,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -MODULE_ALIAS("rcutorture"); -#ifdef MODULE_PARAM_PREFIX -#undef MODULE_PARAM_PREFIX -#endif -#define MODULE_PARAM_PREFIX "rcutorture." torture_param(int, fqs_duration, 0, "Duration of fqs bursts (us), 0 to disable"); @@ -1269,7 +1264,6 @@ rcu_torture_cleanup(void) rcu_torture_barrier_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); - torture_stutter_cleanup(); torture_stop_kthread(rcu_torture_writer, writer_task); if (reader_tasks) { @@ -1297,7 +1291,6 @@ rcu_torture_cleanup(void) for_each_possible_cpu(i) rcutorture_booster_cleanup(i); } - torture_shutdown_cleanup(); /* Wait for all RCU callbacks to fire. */ diff --git a/kernel/torture.c b/kernel/torture.c index 871f63611f7f..b26c7b42becd 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -477,20 +477,6 @@ int torture_shutdown_init(int ssecs, void (*cleanup)(void)) } EXPORT_SYMBOL_GPL(torture_shutdown_init); -/* - * Shut down the shutdown task. Say what??? Heh! This can happen if - * the torture module gets an rmmod before the shutdown time arrives. ;-) - */ -void torture_shutdown_cleanup(void) -{ - if (shutdown_task != NULL) { - VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); - kthread_stop(shutdown_task); - } - shutdown_task = NULL; -} -EXPORT_SYMBOL_GPL(torture_shutdown_cleanup); - /* * Detect and respond to a system shutdown. */ @@ -512,6 +498,20 @@ static struct notifier_block torture_shutdown_nb = { .notifier_call = torture_shutdown_notify, }; +/* + * Shut down the shutdown task. Say what??? Heh! This can happen if + * the torture module gets an rmmod before the shutdown time arrives. ;-) + */ +static void torture_shutdown_cleanup(void) +{ + unregister_reboot_notifier(&torture_shutdown_nb); + if (shutdown_task != NULL) { + VERBOSE_TOROUT_STRING("Stopping torture_shutdown task"); + kthread_stop(shutdown_task); + } + shutdown_task = NULL; +} + /* * Variables for stuttering, which means to periodically pause and * restart testing in order to catch bugs that appear when load is @@ -575,7 +575,7 @@ EXPORT_SYMBOL_GPL(torture_stutter_init); /* * Cleanup after the torture_stutter kthread. 
*/ -void torture_stutter_cleanup(void) +static void torture_stutter_cleanup(void) { if (!stutter_task) return; @@ -583,7 +583,6 @@ void torture_stutter_cleanup(void) kthread_stop(stutter_task); stutter_task = NULL; } -EXPORT_SYMBOL_GPL(torture_stutter_cleanup); /* * Initialize torture module. Please note that this is -not- invoked via @@ -619,7 +618,8 @@ EXPORT_SYMBOL_GPL(torture_init_end); * Clean up torture module. Please note that this is -not- invoked via * the usual module_exit() mechanism, but rather by an explicit call from * the client torture module. Returns true if a race with system shutdown - * is detected. + * is detected, otherwise, all kthreads started by functions in this file + * will be shut down. * * This must be called before the caller starts shutting down its own * kthreads. @@ -635,8 +635,9 @@ bool torture_cleanup(void) } ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); - unregister_reboot_notifier(&torture_shutdown_nb); + torture_shutdown_cleanup(); torture_shuffle_cleanup(); + torture_stutter_cleanup(); torture_onoff_cleanup(); return false; } -- cgit v1.3-14-g43fede From 0af3fe1efa534a43385fe2694c42ffec7a310e46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Feb 2014 15:51:41 -0800 Subject: locktorture: Add a lock-torture kernel module This commit adds the locking counterpart to rcutorture. Signed-off-by: Paul E. McKenney [ paulmck: Make n_lock_torture_errors and torture_spinlock static as suggested by Fengguang Wu. ] Reviewed-by: Josh Triplett --- kernel/locking/Makefile | 1 + kernel/locking/locktorture.c | 421 +++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 15 ++ 3 files changed, 437 insertions(+) create mode 100644 kernel/locking/locktorture.c (limited to 'kernel') diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f66..a28ea6d9e6e8 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o +obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c new file mode 100644 index 000000000000..d69d20d9c9db --- /dev/null +++ b/kernel/locking/locktorture.c @@ -0,0 +1,421 @@ +/* + * Module-based torture test facility for locking + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2014 + * + * Author: Paul E. McKenney + * Based on kernel/rcu/torture.c. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +torture_param(int, nwriters_stress, -1, + "Number of write-locking stress-test threads"); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, + "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shuffle_interval, 3, + "Number of jiffies between shuffles, 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable."); +torture_param(int, stat_interval, 60, + "Number of seconds between stats printk()s"); +torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); +torture_param(bool, verbose, true, + "Enable verbose debugging printk()s"); + +static char *torture_type = "spin_lock"; +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, + "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); + +static atomic_t n_lock_torture_errors; + +static struct task_struct *stats_task; +static struct task_struct **writer_tasks; + +static int nrealwriters_stress; +static bool lock_is_write_held; + +struct lock_writer_stress_stats { + long n_write_lock_fail; + long n_write_lock_acquired; +}; +static struct lock_writer_stress_stats *lwsa; + +#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE) +#define LOCKTORTURE_RUNNABLE_INIT 1 +#else +#define LOCKTORTURE_RUNNABLE_INIT 0 +#endif +int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; +module_param(locktorture_runnable, int, 0444); +MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot"); + +/* Forward reference. */ +static void lock_torture_cleanup(void); + +/* + * Operations vector for selecting different types of tests. + */ +struct lock_torture_ops { + void (*init)(void); + int (*writelock)(void); + void (*write_delay)(struct torture_random_state *trsp); + void (*writeunlock)(void); + unsigned long flags; + const char *name; +}; + +static struct lock_torture_ops *cur_ops; + +/* + * Definitions for lock torture testing. + */ + +static DEFINE_SPINLOCK(torture_spinlock); + +static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) +{ + spin_lock(&torture_spinlock); + return 0; +} + +static void torture_spin_lock_write_delay(struct torture_random_state *trsp) +{ + const unsigned long shortdelay_us = 2; + const unsigned long longdelay_us = 100; + + /* We want a short delay mostly to emulate likely code, and + * we want a long delay occasionally to force massive contention. + */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); + if (!(torture_random(trsp) % + (nrealwriters_stress * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. 
*/ +#endif +} + +static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) +{ + spin_unlock(&torture_spinlock); +} + +static struct lock_torture_ops spin_lock_ops = { + .writelock = torture_spin_lock_write_lock, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_spin_lock_write_unlock, + .name = "spin_lock" +}; + +static int torture_spin_lock_write_lock_irq(void) +__acquires(torture_spinlock_irq) +{ + unsigned long flags; + + spin_lock_irqsave(&torture_spinlock, flags); + cur_ops->flags = flags; + return 0; +} + +static void torture_lock_spin_write_unlock_irq(void) +__releases(torture_spinlock) +{ + spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); +} + +static struct lock_torture_ops spin_lock_irq_ops = { + .writelock = torture_spin_lock_write_lock_irq, + .write_delay = torture_spin_lock_write_delay, + .writeunlock = torture_lock_spin_write_unlock_irq, + .name = "spin_lock_irq" +}; + +/* + * Lock torture writer kthread. Repeatedly acquires and releases + * the lock, checking for duplicate acquisitions. + */ +static int lock_torture_writer(void *arg) +{ + struct lock_writer_stress_stats *lwsp = arg; + static DEFINE_TORTURE_RANDOM(rand); + + VERBOSE_TOROUT_STRING("lock_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + cur_ops->writelock(); + if (WARN_ON_ONCE(lock_is_write_held)) + lwsp->n_write_lock_fail++; + lock_is_write_held = 1; + lwsp->n_write_lock_acquired++; + cur_ops->write_delay(&rand); + lock_is_write_held = 0; + cur_ops->writeunlock(); + stutter_wait("lock_torture_writer"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_writer"); + return 0; +} + +/* + * Create an lock-torture-statistics message in the specified buffer. + */ +static void lock_torture_printk(char *page) +{ + bool fail = 0; + int i; + long max = 0; + long min = lwsa[0].n_write_lock_acquired; + long long sum = 0; + + for (i = 0; i < nrealwriters_stress; i++) { + if (lwsa[i].n_write_lock_fail) + fail = true; + sum += lwsa[i].n_write_lock_acquired; + if (max < lwsa[i].n_write_lock_fail) + max = lwsa[i].n_write_lock_fail; + if (min > lwsa[i].n_write_lock_fail) + min = lwsa[i].n_write_lock_fail; + } + page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); + page += sprintf(page, + "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", + sum, max, min, max / 2 > min ? "???" : "", + fail, fail ? "!!!" : ""); + if (fail) + atomic_inc(&n_lock_torture_errors); +} + +/* + * Print torture statistics. Caller must ensure that there is only one + * call to this function at a given time!!! This is normally accomplished + * by relying on the module system to only have one copy of the module + * loaded, and then by giving the lock_torture_stats kthread full control + * (or the init/cleanup functions when lock_torture_stats thread is not + * running). + */ +static void lock_torture_stats_print(void) +{ + int size = nrealwriters_stress * 200 + 8192; + char *buf; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) { + pr_err("lock_torture_stats_print: Out of memory, need: %d", + size); + return; + } + lock_torture_printk(buf); + pr_alert("%s", buf); + kfree(buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks. 
+ */ +static int lock_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("lock_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + lock_torture_stats_print(); + torture_shutdown_absorb("lock_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("lock_torture_stats"); + return 0; +} + +static inline void +lock_torture_print_module_parms(struct lock_torture_ops *cur_ops, + const char *tag) +{ + pr_alert("%s" TORTURE_FLAG + "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", + torture_type, tag, nrealwriters_stress, stat_interval, verbose, + shuffle_interval, stutter, shutdown_secs, + onoff_interval, onoff_holdoff); +} + +static void lock_torture_cleanup(void) +{ + int i; + + if (torture_cleanup()) + return; + + if (writer_tasks) { + for (i = 0; i < nrealwriters_stress; i++) + torture_stop_kthread(lock_torture_writer, + writer_tasks[i]); + kfree(writer_tasks); + writer_tasks = NULL; + } + + torture_stop_kthread(lock_torture_stats, stats_task); + lock_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (atomic_read(&n_lock_torture_errors)) + lock_torture_print_module_parms(cur_ops, + "End of test: FAILURE"); + else if (torture_onoff_failures()) + lock_torture_print_module_parms(cur_ops, + "End of test: LOCK_HOTPLUG"); + else + lock_torture_print_module_parms(cur_ops, + "End of test: SUCCESS"); +} + +static int __init lock_torture_init(void) +{ + int i; + int firsterr = 0; + static struct lock_torture_ops *torture_ops[] = { + &spin_lock_ops, &spin_lock_irq_ops, + }; + + torture_init_begin(torture_type, verbose, &locktorture_runnable); + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { + pr_alert("lock-torture: invalid torture type: \"%s\"\n", + torture_type); + pr_alert("lock-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + pr_alert(" %s", torture_ops[i]->name); + pr_alert("\n"); + torture_init_end(); + return -EINVAL; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nwriters_stress >= 0) + nrealwriters_stress = nwriters_stress; + else + nrealwriters_stress = 2 * num_online_cpus(); + lock_torture_print_module_parms(cur_ops, "Start of test"); + + /* Initialize the statistics so that each run gets its own numbers. */ + + lock_is_write_held = 0; + lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); + if (lwsa == NULL) { + VERBOSE_TOROUT_STRING("lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + lwsa[i].n_write_lock_fail = 0; + lwsa[i].n_write_lock_acquired = 0; + } + + /* Start up the kthreads. 
*/ + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, + onoff_interval * HZ); + if (firsterr) + goto unwind; + } + if (shuffle_interval > 0) { + firsterr = torture_shuffle_init(shuffle_interval); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, + lock_torture_cleanup); + if (firsterr) + goto unwind; + } + if (stutter > 0) { + firsterr = torture_stutter_init(stutter); + if (firsterr) + goto unwind; + } + + writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters_stress; i++) { + firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], + writer_tasks[i]); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(lock_torture_stats, NULL, + stats_task); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + lock_torture_cleanup(); + return firsterr; +} + +module_init(lock_torture_init); +module_exit(lock_torture_cleanup); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2bfb4e5cdf8c..dd7f8858188a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -980,6 +980,21 @@ config DEBUG_LOCKING_API_SELFTESTS The following locking APIs are covered: spinlocks, rwlocks, mutexes and rwsems. +config LOCK_TORTURE_TEST + tristate "torture tests for locking" + depends on DEBUG_KERNEL + select TORTURE_TEST + default n + help + This option provides a kernel module that runs torture tests + on kernel locking primitives. The kernel module may be built + after the fact on the running kernel to be tested, if desired. + + Say Y here if you want kernel locking-primitive torture tests + to be built into the kernel. + Say M if you want these torture tests to build as a module. + Say N if you are unsure. + endmenu # lock debugging config TRACE_IRQFLAGS -- cgit v1.3-14-g43fede From ff20e251c409da81f2b850c81964908fb4c6fe66 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Feb 2014 08:45:56 -0800 Subject: rcutorture: Add an rcu_busted to test the test This commit adds a deliberately buggy RCU implementation into rcutorture to allow easy checking that rcutorture correctly flags buggy RCU implementations. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcu/rcutorture.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 40792e76a116..da6c38d909f1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -371,6 +371,48 @@ static struct rcu_torture_ops rcu_bh_ops = { .name = "rcu_bh" }; +/* + * Don't even think about trying any of these in real life!!! + * The names includes "busted", and they really means it! + * The only purpose of these functions is to provide a buggy RCU + * implementation to make sure that rcutorture correctly emits + * buggy-RCU error messages. + */ +static void rcu_busted_torture_deferred_free(struct rcu_torture *p) +{ + /* This is a deliberate bug for testing purposes only! */ + rcu_torture_cb(&p->rtort_rcu); +} + +static void synchronize_rcu_busted(void) +{ + /* This is a deliberate bug for testing purposes only! 
*/ +} + +static void +call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + /* This is a deliberate bug for testing purposes only! */ + func(head); +} + +static struct rcu_torture_ops rcu_busted_ops = { + .init = rcu_sync_torture_init, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_busted_torture_deferred_free, + .sync = synchronize_rcu_busted, + .exp_sync = synchronize_rcu_busted, + .call = call_rcu_busted, + .cb_barrier = NULL, + .fqs = NULL, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_busted" +}; + /* * Definitions for srcu torture testing. */ @@ -1371,7 +1413,7 @@ rcu_torture_init(void) int cpu; int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { - &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, }; torture_init_begin(torture_type, verbose, &rcutorture_runnable); -- cgit v1.3-14-g43fede From f881825a73543e9664c7fe7166e06f5f4d569834 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Feb 2014 14:42:51 -0800 Subject: rcutorture: Gracefully handle NULL cleanup hooks Although most torture tests will have some cleanup hook, it is possible that one might not. This commit therefore enables graceful handling of a NULL cleanup hook during torture-test shutdown. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/torture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index b26c7b42becd..acc9afc2f26e 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -454,7 +454,10 @@ static int torture_shutdown(void *arg) VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system"); shutdown_task = NULL; /* Avoid self-kill deadlock. */ - torture_shutdown_hook();/* Shut down the enclosing torture test. */ + if (torture_shutdown_hook) + torture_shutdown_hook(); + else + VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); kernel_power_off(); /* Shut down the system. */ return 0; } -- cgit v1.3-14-g43fede From e086481baf9d0436bdd6e9b739bfa4a83fb89ef5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 11 Feb 2014 08:05:07 -0800 Subject: rcutorture: Add a lock_busted to test the test This commit adds a maximally broken locking primitive in which lock acquisition and release are both no-ops. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/locking/locktorture.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index d69d20d9c9db..f26b1a18e34e 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -112,6 +112,37 @@ static struct lock_torture_ops *cur_ops; * Definitions for lock torture testing. */ +static int torture_lock_busted_write_lock(void) +{ + return 0; /* BUGGY, do not use in real life!!! */ +} + +static void torture_lock_busted_write_delay(struct torture_random_state *trsp) +{ + const unsigned long longdelay_us = 100; + + /* We want a long delay occasionally to force massive contention. */ + if (!(torture_random(trsp) % + (nrealwriters_stress * 2000 * longdelay_us))) + mdelay(longdelay_us); +#ifdef CONFIG_PREEMPT + if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) + preempt_schedule(); /* Allow test to be preempted. 
*/ +#endif +} + +static void torture_lock_busted_write_unlock(void) +{ + /* BUGGY, do not use in real life!!! */ +} + +static struct lock_torture_ops lock_busted_ops = { + .writelock = torture_lock_busted_write_lock, + .write_delay = torture_lock_busted_write_delay, + .writeunlock = torture_lock_busted_write_unlock, + .name = "lock_busted" +}; + static DEFINE_SPINLOCK(torture_spinlock); static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock) @@ -320,7 +351,7 @@ static int __init lock_torture_init(void) int i; int firsterr = 0; static struct lock_torture_ops *torture_ops[] = { - &spin_lock_ops, &spin_lock_irq_ops, + &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, }; torture_init_begin(torture_type, verbose, &locktorture_runnable); -- cgit v1.3-14-g43fede From f5645d3575fe9a2c468889146e152f11c199826e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 21 Feb 2014 14:19:30 -0800 Subject: capability: Use current logging styles Prefix logging output with "capability: " via pr_fmt. Convert printks to pr_<level>. Use pr_<level>_once instead of guard flags. Coalesce formats. Signed-off-by: Joe Perches Acked-by: Serge E. Hallyn Signed-off-by: James Morris --- kernel/capability.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 4e66bf9275b0..d6a6c91863ff 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -7,6 +7,8 @@ * 30 May 2002: Cleanup, Robert M. Love */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -42,15 +44,10 @@ __setup("no_file_caps", file_caps_disable); static void warn_legacy_capability_use(void) { - static int warned; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses 32-bit capabilities" - " (legacy support in use)\n", - get_task_comm(name, current)); - warned = 1; - } + char name[sizeof(current->comm)]; + + pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", + get_task_comm(name, current)); } /* @@ -71,16 +68,10 @@ static void warn_legacy_capability_use(void) static void warn_deprecated_v2(void) { - static int warned; + char name[sizeof(current->comm)]; - if (!warned) { - char name[sizeof(current->comm)]; - - printk(KERN_INFO "warning: `%s' uses deprecated v2" - " capabilities in a way that may be insecure.\n", - get_task_comm(name, current)); - warned = 1; - } + pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", + get_task_comm(name, current)); } /* @@ -380,7 +371,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap) bool ns_capable(struct user_namespace *ns, int cap) { if (unlikely(!cap_valid(cap))) { - printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } -- cgit v1.3-14-g43fede From 5fd77595ec62141fa71e575bdbf410e0192f87d0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Feb 2014 16:39:55 +0100 Subject: smp: Iterate functions through llist_for_each_entry_safe() The IPI function llist iteration is open-coded. Let's simplify this by using an llist iterator. Also we want to keep the iteration safe against possible csd.llist->next value reuse from the IPI handler. At least the block subsystem used to do such things, so let's stay careful and use llist_for_each_entry_safe().
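For readers unfamiliar with the iterator, here is a self-contained sketch of the safe-iteration pattern (illustration only, not from this series; struct ipi_item and drain_queue() are made-up names). The _safe variant samples the next pointer before the loop body runs, so the body is free to dispose of the current entry:

#include <linux/llist.h>
#include <linux/slab.h>

struct ipi_item {
	struct llist_node llnode;
	void (*func)(void *info);
	void *info;
};

static void drain_queue(struct llist_head *head)
{
	struct ipi_item *item, *next;
	struct llist_node *entry = llist_del_all(head);

	/* item may be freed inside the loop; its successor was cached first. */
	llist_for_each_entry_safe(item, next, entry, llnode) {
		item->func(item->info);
		kfree(item);
	}
}

The same property is what lets the IPI handler in the diff below call csd_unlock() inside the loop: the csd may be reused by its owner immediately, yet the iterator never touches it again.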
Signed-off-by: Jan Kara Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/smp.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index ffee35bef179..e3852de042a6 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -151,7 +151,8 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) */ void generic_smp_call_function_single_interrupt(void) { - struct llist_node *entry, *next; + struct llist_node *entry; + struct call_single_data *csd, *csd_next; /* * Shouldn't receive this interrupt on a cpu that is not yet online. @@ -161,16 +162,9 @@ void generic_smp_call_function_single_interrupt(void) entry = llist_del_all(&__get_cpu_var(call_single_queue)); entry = llist_reverse_order(entry); - while (entry) { - struct call_single_data *csd; - - next = entry->next; - - csd = llist_entry(entry, struct call_single_data, llist); + llist_for_each_entry_safe(csd, csd_next, entry, llist) { csd->func(csd->info); csd_unlock(csd); - - entry = next; } } -- cgit v1.3-14-g43fede From 08eed44c7249d381a099bc55577e55c6bb533160 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Feb 2014 16:39:57 +0100 Subject: smp: Teach __smp_call_function_single() to check for offline cpus Align __smp_call_function_single() with smp_call_function_single() so that it also checks whether requested cpu is still online. Signed-off-by: Jan Kara Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- include/linux/smp.h | 3 +-- kernel/smp.c | 11 +++++++---- kernel/up.c | 5 +++-- 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index 4991c6b12831..c39074c794c5 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -50,8 +50,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); -void __smp_call_function_single(int cpuid, struct call_single_data *data, - int wait); +int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait); #ifdef CONFIG_SMP diff --git a/kernel/smp.c b/kernel/smp.c index e3852de042a6..5ff14e3739ca 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -276,18 +276,18 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); /** * __smp_call_function_single(): Run a function on a specific CPU * @cpu: The CPU to run on. - * @data: Pre-allocated and setup data structure + * @csd: Pre-allocated and setup data structure * @wait: If true, wait until function has completed on specified CPU. * * Like smp_call_function_single(), but allow caller to pass in a * pre-allocated data structure. Useful for embedding @data inside * other structures, for instance. 
*/ -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) +int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) { unsigned int this_cpu; unsigned long flags; + int err = 0; this_cpu = get_cpu(); /* @@ -303,11 +303,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *csd, local_irq_save(flags); csd->func(csd->info); local_irq_restore(flags); - } else { + } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { csd_lock(csd); generic_exec_single(cpu, csd, wait); + } else { + err = -ENXIO; /* CPU not online */ } put_cpu(); + return err; } EXPORT_SYMBOL_GPL(__smp_call_function_single); diff --git a/kernel/up.c b/kernel/up.c index 509403e3fbc6..cdf03d16840e 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,14 +22,15 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -void __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) +int __smp_call_function_single(int cpu, struct call_single_data *csd, + int wait) { unsigned long flags; local_irq_save(flags); csd->func(csd->info); local_irq_restore(flags); + return 0; } EXPORT_SYMBOL(__smp_call_function_single); -- cgit v1.3-14-g43fede From 8b28499a71d3431c9128abc743e2d2bfbdae3ed4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:39:58 +0100 Subject: smp: Consolidate the various smp_call_function_single() declensions __smp_call_function_single() and smp_call_function_single() share some code that can be factorized: execute inline when the target is local, check if the target is online, lock the csd, call generic_exec_single(). Let's move the common parts to generic_exec_single(). Reviewed-by: Jan Kara Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/smp.c | 80 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 5ff14e3739ca..64bb0d48e96f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -117,13 +117,43 @@ static void csd_unlock(struct call_single_data *csd) csd->flags &= ~CSD_FLAG_LOCK; } +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); + /* * Insert a previously allocated call_single_data element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set.
*/ -static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) +static int generic_exec_single(int cpu, struct call_single_data *csd, + smp_call_func_t func, void *info, int wait) { + struct call_single_data csd_stack = { .flags = 0 }; + unsigned long flags; + + + if (cpu == smp_processor_id()) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; + } + + + if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) + return -ENXIO; + + + if (!csd) { + csd = &csd_stack; + if (!wait) + csd = &__get_cpu_var(csd_data); + } + + csd_lock(csd); + + csd->func = func; + csd->info = info; + if (wait) csd->flags |= CSD_FLAG_WAIT; @@ -143,6 +173,8 @@ static void generic_exec_single(int cpu, struct call_single_data *csd, int wait) if (wait) csd_lock_wait(csd); + + return 0; } /* @@ -168,8 +200,6 @@ void generic_smp_call_function_single_interrupt(void) } } -static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); - /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. @@ -181,12 +211,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { - struct call_single_data d = { - .flags = 0, - }; - unsigned long flags; int this_cpu; - int err = 0; + int err; /* * prevent preemption and reschedule on another processor, @@ -203,26 +229,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); - if (cpu == this_cpu) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } else { - if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *csd = &d; - - if (!wait) - csd = &__get_cpu_var(csd_data); - - csd_lock(csd); - - csd->func = func; - csd->info = info; - generic_exec_single(cpu, csd, wait); - } else { - err = -ENXIO; /* CPU not online */ - } - } + err = generic_exec_single(cpu, NULL, func, info, wait); put_cpu(); @@ -285,9 +292,8 @@ EXPORT_SYMBOL_GPL(smp_call_function_any); */ int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) { - unsigned int this_cpu; - unsigned long flags; int err = 0; + int this_cpu; this_cpu = get_cpu(); /* @@ -296,20 +302,12 @@ int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled() && !oops_in_progress); - if (cpu == this_cpu) { - local_irq_save(flags); - csd->func(csd->info); - local_irq_restore(flags); - } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - csd_lock(csd); - generic_exec_single(cpu, csd, wait); - } else { - err = -ENXIO; /* CPU not online */ - } + err = generic_exec_single(cpu, csd, csd->func, csd->info, wait); put_cpu(); + return err; } EXPORT_SYMBOL_GPL(__smp_call_function_single); -- cgit v1.3-14-g43fede From d7877c03f1b62de06f9c00417952f39f56c1ab00 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:39:59 +0100 Subject: smp: Move __smp_call_function_single() below its safe version Move this function closer to __smp_call_function_single(). These functions have very similar behavior and should be displayed in the same block for clarity. 
Reviewed-by: Jan Kara Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/smp.c | 64 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 64bb0d48e96f..fa04ab938e52 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -237,6 +237,38 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, } EXPORT_SYMBOL(smp_call_function_single); +/** + * __smp_call_function_single(): Run a function on a specific CPU + * @cpu: The CPU to run on. + * @csd: Pre-allocated and setup data structure + * @wait: If true, wait until function has completed on specified CPU. + * + * Like smp_call_function_single(), but allow caller to pass in a + * pre-allocated data structure. Useful for embedding @data inside + * other structures, for instance. + */ +int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) +{ + int err = 0; + int this_cpu; + + this_cpu = get_cpu(); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled() + && !oops_in_progress); + + err = generic_exec_single(cpu, csd, csd->func, csd->info, wait); + put_cpu(); + + return err; +} +EXPORT_SYMBOL_GPL(__smp_call_function_single); + /* * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. @@ -280,38 +312,6 @@ call: } EXPORT_SYMBOL_GPL(smp_call_function_any); -/** - * __smp_call_function_single(): Run a function on a specific CPU - * @cpu: The CPU to run on. - * @csd: Pre-allocated and setup data structure - * @wait: If true, wait until function has completed on specified CPU. - * - * Like smp_call_function_single(), but allow caller to pass in a - * pre-allocated data structure. Useful for embedding @data inside - * other structures, for instance. - */ -int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) -{ - int err = 0; - int this_cpu; - - this_cpu = get_cpu(); - /* - * Can deadlock when called with interrupts disabled. - * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. - */ - WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled() - && !oops_in_progress); - - err = generic_exec_single(cpu, csd, csd->func, csd->info, wait); - put_cpu(); - - return err; -} -EXPORT_SYMBOL_GPL(__smp_call_function_single); - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). -- cgit v1.3-14-g43fede From e0a23b0628b10d25f2c178be6fcfc17c1ab49fda Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:40:00 +0100 Subject: watchdog: Simplify a little the IPI call In order to remotely restart the watchdog hrtimer, update_timers() allocates a csd on the stack and passes it to __smp_call_function_single(). There is no particular need, however, for a specific csd here. Let's simplify that a little by calling smp_call_function_single(), which can already take care of the csd allocation by itself.
Acked-by: Don Zickus Reviewed-by: Michal Hocko Cc: Andrew Morton Cc: Christoph Hellwig Cc: Don Zickus Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Cc: Michal Hocko Cc: Srivatsa S. Bhat Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- kernel/watchdog.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4431610f049a..01c6f979486f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -505,7 +505,6 @@ static void restart_watchdog_hrtimer(void *info) static void update_timers(int cpu) { - struct call_single_data data = {.func = restart_watchdog_hrtimer}; /* * Make sure that perf event counter will adopt to a new * sampling period. Updating the sampling period directly would * be much easier but require a sleep from the hrtimer callback * might be late already so we have to restart the timer as well. */ watchdog_nmi_disable(cpu); - __smp_call_function_single(cpu, &data, 1); + smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); watchdog_nmi_enable(cpu); } -- cgit v1.3-14-g43fede From fce8ad1568c57e7f334018dec4fa1744c926c135 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:40:01 +0100 Subject: smp: Remove wait argument from __smp_call_function_single() The main point of calling __smp_call_function_single() is to send an IPI in a purely asynchronous way. By embedding a csd in an object, a caller can send the IPI without waiting for a previous one to complete, as is required by smp_call_function_single() for example. As such, sending this kind of IPI can be safe even when irqs are disabled. This flexibility comes at the expense of the caller, who then needs to synchronize the csd lifecycle by itself and make sure that IPIs on a single csd are serialized. This is how __smp_call_function_single() works when wait = 0, and this use case is relevant. Now there doesn't seem to be any use case with wait = 1 that can't be covered by smp_call_function_single() instead, which is safer. Let's look at the two possible scenarios: 1) The user calls __smp_call_function_single(wait = 1) on a csd embedded in an object. It looks like a nice and convenient pattern at first sight because we can then retrieve the object from the IPI handler easily. But actually it is a waste of memory space in the object, since the csd can be allocated from the stack by smp_call_function_single(wait = 1) and the object can be passed as the IPI argument. Besides that, embedding the csd in an object is more error-prone because the caller must take care of the serialization of the IPIs for this csd. 2) The user calls __smp_call_function_single(wait = 1) on a csd that is allocated on the stack. It's OK, but smp_call_function_single() can do it as well and already takes care of the allocation on the stack. Again, it's simpler and less error-prone. Therefore, using the underscore-prefixed API version with wait = 1 is a bad pattern and a sign that the caller can do something safer and simpler. There was a single user of that, which has just been converted. So let's remove this option to discourage further users.
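To make scenario 1 concrete, the conversion argued for above looks roughly like the sketch below; struct obj, my_handler() and poke() are invented names, and the csd field shown in the "before" form is exactly the space the conversion reclaims:

/* Before: csd embedded in the object only so that the caller can wait. */
struct obj {
        struct call_single_data csd;    /* wasted once converted */
        int val;
};

static void my_handler(void *info)
{
        struct obj *obj = info;         /* object arrives via @info instead */

        obj->val++;
}

/* After: smp_call_function_single() keeps the csd on its own stack. */
static int poke(int cpu, struct obj *obj)
{
        return smp_call_function_single(cpu, my_handler, obj, 1);
}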
Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/blk-softirq.c | 2 +- drivers/cpuidle/coupled.c | 2 +- include/linux/smp.h | 2 +- kernel/sched/core.c | 2 +- kernel/smp.c | 19 ++++--------------- kernel/up.c | 3 +-- net/core/dev.c | 2 +- 8 files changed, 11 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/block/blk-mq.c b/block/blk-mq.c index 1fa9dd153fde..62154edf1489 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -353,7 +353,7 @@ void __blk_mq_complete_request(struct request *rq) rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; - __smp_call_function_single(ctx->cpu, &rq->csd, 0); + __smp_call_function_single(ctx->cpu, &rq->csd); } else { rq->q->softirq_done_fn(rq); } diff --git a/block/blk-softirq.c b/block/blk-softirq.c index b5c37d96cf0e..6345b7ebd0df 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -70,7 +70,7 @@ static int raise_blk_irq(int cpu, struct request *rq) data->info = rq; data->flags = 0; - __smp_call_function_single(cpu, data, 0); + __smp_call_function_single(cpu, data); return 0; } diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index e952936418d0..04115947accc 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -323,7 +323,7 @@ static void cpuidle_coupled_poke(int cpu) struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending)) - __smp_call_function_single(cpu, csd, 0); + __smp_call_function_single(cpu, csd); } /** diff --git a/include/linux/smp.h b/include/linux/smp.h index c39074c794c5..b410a1f23281 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -50,7 +50,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); -int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait); +int __smp_call_function_single(int cpu, struct call_single_data *csd); #ifdef CONFIG_SMP diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..eba3d84765f3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -432,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/smp.c b/kernel/smp.c index fa04ab938e52..b76763189752 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -241,29 +241,18 @@ EXPORT_SYMBOL(smp_call_function_single); * __smp_call_function_single(): Run a function on a specific CPU * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure - * @wait: If true, wait until function has completed on specified CPU. * * Like smp_call_function_single(), but allow caller to pass in a * pre-allocated data structure. Useful for embedding @data inside * other structures, for instance. */ -int __smp_call_function_single(int cpu, struct call_single_data *csd, int wait) +int __smp_call_function_single(int cpu, struct call_single_data *csd) { int err = 0; - int this_cpu; - this_cpu = get_cpu(); - /* - * Can deadlock when called with interrupts disabled. 
- * We allow cpu's that are not yet online though, as no one else can - * send smp call function interrupt to this cpu and as such deadlocks - * can't happen. - */ - WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled() - && !oops_in_progress); - - err = generic_exec_single(cpu, csd, csd->func, csd->info, wait); - put_cpu(); + preempt_disable(); + err = generic_exec_single(cpu, csd, csd->func, csd->info, 0); + preempt_enable(); return err; } diff --git a/kernel/up.c b/kernel/up.c index cdf03d16840e..4e199d4cef8e 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,8 +22,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int __smp_call_function_single(int cpu, struct call_single_data *csd, - int wait) +int __smp_call_function_single(int cpu, struct call_single_data *csd) { unsigned long flags; local_irq_save(flags); csd->func(csd->info); local_irq_restore(flags); + return 0; } EXPORT_SYMBOL(__smp_call_function_single); diff --git a/net/core/dev.c b/net/core/dev.c index 4ad1b78c9c77..d1298128bff4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4129,7 +4129,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) if (cpu_online(remsd->cpu)) __smp_call_function_single(remsd->cpu, - &remsd->csd, 0); + &remsd->csd); remsd = next; } } else -- cgit v1.3-14-g43fede From c46fff2a3b29794b35d717b5680a27f31a6a6bc0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Feb 2014 16:40:02 +0100 Subject: smp: Rename __smp_call_function_single() to smp_call_function_single_async() The name __smp_call_function_single() doesn't tell much about the properties of this function, especially when compared to smp_call_function_single(). The comments above the implementation are also misleading. The main point of this function is actually not to be able to embed the csd in an object. This is actually a requirement that results from the purpose of this function, which is to raise an IPI asynchronously. As such it can be called with interrupts disabled. And this feature comes at the cost of the caller, who then needs to serialize the IPIs on this csd. Let's rename the function and enhance the comments so that they reflect these properties.
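The serialization this renamed API demands is typically a one-bit "in flight" guard around the csd, as the hrtick_csd_pending and cpuidle_coupled poke users converted earlier in this series do. A sketch follows, with struct obj and its flag as illustrative assumptions (obj->csd.func/.info are presumed initialized to obj_ipi/obj beforehand):

struct obj {
        struct call_single_data csd;    /* at most one IPI in flight */
        atomic_t csd_pending;
};

static void obj_ipi(void *info)
{
        struct obj *obj = info;

        /* ... do the remote work ... */
        atomic_set(&obj->csd_pending, 0);       /* csd reusable from here on */
}

static void obj_poke(int cpu, struct obj *obj)
{
        /* queue only if no IPI is currently in flight on this csd */
        if (!atomic_xchg(&obj->csd_pending, 1))
                smp_call_function_single_async(cpu, &obj->csd);
}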
Suggested-by: Christoph Hellwig Cc: Andrew Morton Cc: Christoph Hellwig Cc: Ingo Molnar Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Frederic Weisbecker Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- block/blk-softirq.c | 2 +- drivers/cpuidle/coupled.c | 2 +- include/linux/smp.h | 2 +- kernel/sched/core.c | 2 +- kernel/smp.c | 19 +++++++++++++------ kernel/up.c | 4 ++-- net/core/dev.c | 2 +- 8 files changed, 21 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/block/blk-mq.c b/block/blk-mq.c index 62154edf1489..6468a715a0e4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -353,7 +353,7 @@ void __blk_mq_complete_request(struct request *rq) rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; - __smp_call_function_single(ctx->cpu, &rq->csd); + smp_call_function_single_async(ctx->cpu, &rq->csd); } else { rq->q->softirq_done_fn(rq); } diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 6345b7ebd0df..ebd6b6f1bdeb 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -70,7 +70,7 @@ static int raise_blk_irq(int cpu, struct request *rq) data->info = rq; data->flags = 0; - __smp_call_function_single(cpu, data); + smp_call_function_single_async(cpu, data); return 0; } diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 04115947accc..cb6654bfad77 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -323,7 +323,7 @@ static void cpuidle_coupled_poke(int cpu) struct call_single_data *csd = &per_cpu(cpuidle_coupled_poke_cb, cpu); if (!cpumask_test_and_set_cpu(cpu, &cpuidle_coupled_poke_pending)) - __smp_call_function_single(cpu, csd); + smp_call_function_single_async(cpu, csd); } /** diff --git a/include/linux/smp.h b/include/linux/smp.h index b410a1f23281..633f5edd7470 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -50,7 +50,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); -int __smp_call_function_single(int cpu, struct call_single_data *csd); +int smp_call_function_single_async(int cpu, struct call_single_data *csd); #ifdef CONFIG_SMP diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eba3d84765f3..0cca04a53de0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -432,7 +432,7 @@ void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { __hrtick_restart(rq); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/smp.c b/kernel/smp.c index b76763189752..06d574e42c72 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -238,15 +238,22 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, EXPORT_SYMBOL(smp_call_function_single); /** - * __smp_call_function_single(): Run a function on a specific CPU + * smp_call_function_single_async(): Run an asynchronous function on a + * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * - * Like smp_call_function_single(), but allow caller to pass in a - * pre-allocated data structure. Useful for embedding @data inside - * other structures, for instance. + * Like smp_call_function_single(), but the call is asynchonous and + * can thus be done from contexts with disabled interrupts. 
+ * + * The caller passes his own pre-allocated data structure + * (ie: embedded in an object) and is responsible for synchronizing it + * such that the IPIs performed on the @csd are strictly serialized. + * + * NOTE: Be careful, there is unfortunately no current debugging facility to + * validate the correctness of this serialization. */ -int __smp_call_function_single(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, struct call_single_data *csd) { int err = 0; @@ -256,7 +263,7 @@ int __smp_call_function_single(int cpu, struct call_single_data *csd) return err; } -EXPORT_SYMBOL_GPL(__smp_call_function_single); +EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus diff --git a/kernel/up.c b/kernel/up.c index 4e199d4cef8e..1760bf3d1463 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,7 +22,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); -int __smp_call_function_single(int cpu, struct call_single_data *csd) +int smp_call_function_single_async(int cpu, struct call_single_data *csd) { unsigned long flags; @@ -31,7 +31,7 @@ int __smp_call_function_single(int cpu, struct call_single_data *csd) local_irq_restore(flags); return 0; } -EXPORT_SYMBOL(__smp_call_function_single); +EXPORT_SYMBOL(smp_call_function_single_async); int on_each_cpu(smp_call_func_t func, void *info, int wait) { diff --git a/net/core/dev.c b/net/core/dev.c index d1298128bff4..ac7a2abb7f1a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4128,7 +4128,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) struct softnet_data *next = remsd->rps_ipi_next; if (cpu_online(remsd->cpu)) - __smp_call_function_single(remsd->cpu, + smp_call_function_single_async(remsd->cpu, &remsd->csd); remsd = next; } -- cgit v1.3-14-g43fede From c75611282cf1bf717c1866e7a7eb4d0743815187 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:01 -0500 Subject: cgroup: add css_set->mg_tasks Currently, while migrating tasks from one cgroup to another, cgroup_attach_task() builds a flex array of all target tasks; unfortunately, this has a couple of issues. * Flex array has a size limit. On 64bit, struct task_and_cgroup is 24 bytes, making the flex element limit around 87k. It is a high number but not impossible to hit. This means that the current cgroup implementation can't migrate a process with more than 87k threads. * Process migration involves memory allocation whose size is dependent on the number of threads the process has. This means that cgroup core can't guarantee success or failure of multi-process migrations as memory allocation failure can happen in the middle. This is in part because cgroup can't grab threadgroup locks of multiple processes at the same time, so when there are multiple processes to migrate, it is impossible to tell how many tasks are to be migrated beforehand. Note that this already affects cgroup_transfer_tasks(). cgroup currently cannot guarantee atomic success or failure of the operation. It may fail in the middle and after such failure cgroup doesn't have enough information to roll back properly. It just aborts with some tasks migrated and others not. To resolve the situation, we're going to use task->cg_list during migration too. Instead of building a separate array, target tasks will be linked into a dedicated migration list_head on the owning css_set.
Tasks on the migration list are treated the same as tasks on the usual tasks list; however, being on a separate list allows the cgroup migration code path to keep track of the target tasks by simply keeping the list of css_sets with tasks being migrated, making unpredictable dynamic allocation unnecessary. In preparation for such a migration path update, this patch introduces the css_set->mg_tasks list and updates css_set task iterations so that they walk both css_set->tasks and ->mg_tasks. Note that ->mg_tasks isn't used yet. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 8 ++++++-- kernel/cgroup.c | 56 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 43 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8c283a910b91..528e2aed36c3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -324,10 +324,14 @@ struct css_set { struct hlist_node hlist; /* - * List running through all tasks using this cgroup - * group. Protected by css_set_lock + * Lists running through all tasks using this cgroup group. + * mg_tasks lists tasks which belong to this cset but are in the + * process of being migrated out or in. Protected by + * css_set_rwsem, but, during migration, once tasks are moved to + * mg_tasks, it can be read safely while holding cgroup_mutex. */ struct list_head tasks; + struct list_head mg_tasks; /* * List of cgrp_cset_links pointing at cgroups referenced from this diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8ab800c7bac0..b80c611ff836 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -644,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, atomic_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); + INIT_LIST_HEAD(&cset->mg_tasks); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in @@ -2590,9 +2591,14 @@ static void css_advance_task_iter(struct css_task_iter *it) } link = list_entry(l, struct cgrp_cset_link, cset_link); cset = link->cset; - } while (list_empty(&cset->tasks)); + } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks)); + it->cset_link = l; + + if (!list_empty(&cset->tasks)) + it->task = cset->tasks.next; + else + it->task = cset->mg_tasks.next; } /** @@ -2636,24 +2642,29 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; - struct cgrp_cset_link *link; + struct cgrp_cset_link *link = list_entry(it->cset_link, + struct cgrp_cset_link, cset_link); /* If the iterator cg is NULL, we have no tasks */ if (!it->cset_link) return NULL; res = list_entry(l, struct task_struct, cg_list); - /* Advance iterator to find next entry */ + + /* + * Advance iterator to find next entry. cset->tasks is consumed + * first and then ->mg_tasks. After ->mg_tasks, we move onto the + * next cset. + */ l = l->next; - link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); - if (l == &link->cset->tasks) { - /* - * We reached the end of this task list - move on to the - * next cgrp_cset_link.
*/ + + if (l == &link->cset->tasks) + l = link->cset->mg_tasks.next; + + if (l == &link->cset->mg_tasks) css_advance_task_iter(it); - } else { + else it->task = l; - } + return res; } @@ -4502,16 +4513,23 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct css_set *cset = link->cset; struct task_struct *task; int count = 0; + seq_printf(seq, "css_set %p\n", cset); + list_for_each_entry(task, &cset->tasks, cg_list) { - if (count++ > MAX_TASKS_SHOWN_PER_CSS) { - seq_puts(seq, " ...\n"); - break; - } else { - seq_printf(seq, " task %d\n", - task_pid_vnr(task)); - } + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); + } + + list_for_each_entry(task, &cset->mg_tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) + goto overflow; + seq_printf(seq, " task %d\n", task_pid_vnr(task)); } + continue; + overflow: + seq_puts(seq, " ...\n"); } up_read(&css_set_rwsem); return 0; -- cgit v1.3-14-g43fede From b3dc094e93905ae9c1bc0815402ad8e5b203d068 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:01 -0500 Subject: cgroup: use css_set->mg_tasks to track target tasks during migration Currently, while migrating tasks from one cgroup to another, cgroup_attach_task() builds a flex array of all target tasks; unfortunately, this has a couple of issues. * Flex array has a size limit. On 64bit, struct task_and_cgroup is 24 bytes, making the flex element limit around 87k. It is a high number but not impossible to hit. This means that the current cgroup implementation can't migrate a process with more than 87k threads. * Process migration involves memory allocation whose size is dependent on the number of threads the process has. This means that cgroup core can't guarantee success or failure of multi-process migrations as memory allocation failure can happen in the middle. This is in part because cgroup can't grab threadgroup locks of multiple processes at the same time, so when there are multiple processes to migrate, it is impossible to tell how many tasks are to be migrated beforehand. Note that this already affects cgroup_transfer_tasks(). cgroup currently cannot guarantee atomic success or failure of the operation. It may fail in the middle and after such failure cgroup doesn't have enough information to roll back properly. It just aborts with some tasks migrated and others not. To resolve the situation, this patch updates the migration path to use task->cg_list to track target tasks. The previous patch already added css_set->mg_tasks and updated iterations in non-migration paths to include them during task migration. This patch updates the migration path to actually make use of it. Instead of putting onto a flex_array, each target task is moved from its css_set->tasks list to css_set->mg_tasks and the migration path keeps track of all the source css_sets and the associated cgroups. Once all source css_sets are determined, the destination css_set for each is determined, linked to the matching source css_set and put on a separate list. To iterate the target tasks, the migration path just needs to iterate through either the source or target css_sets, depending on whether migration has been committed or not, and the tasks on their ->mg_tasks lists. cgroup_taskset is updated to contain the list_heads for source and target css_sets and the iteration cursor. cgroup_taskset_*() are accordingly updated to walk through css_sets and their ->mg_tasks. This resolves the above-listed issues with moderate additional complexity.
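The invariant behind these two patches is that every task attached to a css_set sits on exactly one of ->tasks and ->mg_tasks, so a complete walk must visit both lists — which is what css_task_iter and the debug file above now do. Schematically (visit_all_tasks() and the visit callback are placeholders, and locking is elided):

static void visit_all_tasks(struct css_set *cset,
                            void (*visit)(struct task_struct *task))
{
        struct task_struct *task;

        /* callers hold css_set_rwsem; the two lists cover all tasks */
        list_for_each_entry(task, &cset->tasks, cg_list)
                visit(task);                    /* tasks not being migrated */
        list_for_each_entry(task, &cset->mg_tasks, cg_list)
                visit(task);                    /* migration targets in flight */
}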
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 16 ++++ kernel/cgroup.c | 223 +++++++++++++++++++++++++------------------------ 2 files changed, 131 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 528e2aed36c3..3a1cb265afd6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -346,6 +346,22 @@ struct css_set { */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + /* + * List of csets participating in the on-going migration either as + * source or destination. Protected by cgroup_mutex. + */ + struct list_head mg_node; + + /* + * If this cset is acting as the source of migration the following + * two fields are set. mg_src_cgrp is the source cgroup of the + * on-going migration and mg_dst_cset is the destination cset the + * target tasks on this cset should be migrated to. Protected by + * cgroup_mutex. + */ + struct cgroup *mg_src_cgrp; + struct css_set *mg_dst_cset; + /* For RCU-protected deletion */ struct rcu_head rcu_head; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b80c611ff836..5def4a800425 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -52,7 +52,6 @@ #include #include #include /* TODO: replace with more sophisticated array */ -#include /* used in cgroup_attach_task */ #include #include @@ -645,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->mg_node); INIT_HLIST_NODE(&cset->hlist); /* Copy the set of subsystem state objects generated in @@ -1639,20 +1639,26 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) } EXPORT_SYMBOL_GPL(task_cgroup_path); -/* - * Control Group taskset - */ -struct task_and_cgroup { - struct task_struct *task; - struct cgroup *cgrp; - struct css_set *cset; -}; - +/* used to track tasks and other necessary states during migration */ struct cgroup_taskset { - struct task_and_cgroup single; - struct flex_array *tc_array; - int tc_array_len; - int idx; + /* the src and dst cset list running through cset->mg_node */ + struct list_head src_csets; + struct list_head dst_csets; + + /* + * Fields for cgroup_taskset_*() iteration. + * + * Before migration is committed, the target migration tasks are on + * ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of + * the csets on ->dst_csets. ->csets point to either ->src_csets + * or ->dst_csets depending on whether migration is committed. + * + * ->cur_csets and ->cur_task point to the current task position + * during iteration. 
+ */ + struct list_head *csets; + struct css_set *cur_cset; + struct task_struct *cur_task; }; /** @@ -1663,12 +1669,10 @@ struct cgroup_taskset { */ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) { - if (tset->tc_array) { - tset->idx = 0; - return cgroup_taskset_next(tset); - } else { - return tset->single.task; - } + tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); + tset->cur_task = NULL; + + return cgroup_taskset_next(tset); } /** @@ -1680,13 +1684,27 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) */ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) { - struct task_and_cgroup *tc; + struct css_set *cset = tset->cur_cset; + struct task_struct *task = tset->cur_task; - if (!tset->tc_array || tset->idx >= tset->tc_array_len) - return NULL; + while (&cset->mg_node != tset->csets) { + if (!task) + task = list_first_entry(&cset->mg_tasks, + struct task_struct, cg_list); + else + task = list_next_entry(task, cg_list); - tc = flex_array_get(tset->tc_array, tset->idx++); - return tc->task; + if (&task->cg_list != &cset->mg_tasks) { + tset->cur_cset = cset; + tset->cur_task = task; + return task; + } + + cset = list_next_entry(cset, mg_node); + task = NULL; + } + + return NULL; } /** @@ -1714,11 +1732,13 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); old_cset = task_css_set(tsk); + get_css_set(new_cset); + task_lock(tsk); rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - list_move(&tsk->cg_list, &new_cset->tasks); + list_move(&tsk->cg_list, &new_cset->mg_tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1741,80 +1761,58 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, bool threadgroup) { - int ret, i, group_size; - struct cgroupfs_root *root = cgrp->root; + struct cgroup_taskset tset = { + .src_csets = LIST_HEAD_INIT(tset.src_csets), + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), + .csets = &tset.src_csets, + }; struct cgroup_subsys_state *css, *failed_css = NULL; - /* threadgroup list cursor and array */ - struct task_struct *task; - struct task_and_cgroup *tc; - struct flex_array *group; - struct cgroup_taskset tset = { }; - - /* - * step 0: in order to do expensive, possibly blocking operations for - * every thread, we cannot iterate the thread group list, since it needs - * rcu or tasklist locked. instead, build an array of all threads in the - * group - group_rwsem prevents new threads from appearing, and if - * threads exit, this will just be an over-estimate. - */ - if (threadgroup) - group_size = get_nr_threads(leader); - else - group_size = 1; - /* flex_array supports very large thread-groups better than kmalloc. */ - group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); - if (!group) - return -ENOMEM; - /* pre-allocate to guarantee space while iterating in rcu read-side. */ - ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (ret) - goto out_free_group_list; + struct css_set *cset, *tmp_cset; + struct task_struct *task, *tmp_task; + int i, ret; - i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. 
*/ - down_read(&css_set_rwsem); + down_write(&css_set_rwsem); rcu_read_lock(); task = leader; do { - struct task_and_cgroup ent; + struct cgroup *src_cgrp; /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) goto next; - /* as per above, nr_threads may decrease, but not increase. */ - BUG_ON(i >= group_size); - ent.task = task; - ent.cgrp = task_cgroup_from_root(task, root); + cset = task_css_set(task); + src_cgrp = task_cgroup_from_root(task, cgrp->root); + /* nothing to do if this task is already in the cgroup */ - if (ent.cgrp == cgrp) + if (src_cgrp == cgrp) goto next; - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ - ret = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(ret != 0); - i++; + + if (!cset->mg_src_cgrp) { + WARN_ON(!list_empty(&cset->mg_tasks)); + WARN_ON(!list_empty(&cset->mg_node)); + + cset->mg_src_cgrp = src_cgrp; + list_add(&cset->mg_node, &tset.src_csets); + get_css_set(cset); + } + + list_move(&task->cg_list, &cset->mg_tasks); next: if (!threadgroup) break; } while_each_thread(leader, task); rcu_read_unlock(); - up_read(&css_set_rwsem); - /* remember the number of threads in the array for later. */ - group_size = i; - tset.tc_array = group; - tset.tc_array_len = group_size; + up_write(&css_set_rwsem); /* methods shouldn't be called if no task is actually migrating */ - ret = 0; - if (!group_size) - goto out_free_group_list; + if (list_empty(&tset.src_csets)) + return 0; /* * step 1: check that we can legitimately attach to the cgroup. @@ -1833,16 +1831,21 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. */ - for (i = 0; i < group_size; i++) { - struct css_set *old_cset; + list_for_each_entry(cset, &tset.src_csets, mg_node) { + struct css_set *dst_cset; - tc = flex_array_get(group, i); - old_cset = task_css_set(tc->task); - tc->cset = find_css_set(old_cset, cgrp); - if (!tc->cset) { + dst_cset = find_css_set(cset, cgrp); + if (!dst_cset) { ret = -ENOMEM; - goto out_put_css_set_refs; + goto out_release_tset; } + + if (list_empty(&dst_cset->mg_node)) + list_add(&dst_cset->mg_node, &tset.dst_csets); + else + put_css_set(dst_cset, false); + + cset->mg_dst_cset = dst_cset; } /* @@ -1851,12 +1854,17 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, * failure cases after here, so this is the commit point. */ down_write(&css_set_rwsem); - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); + list_for_each_entry(cset, &tset.src_csets, mg_node) { + list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) + cgroup_task_migrate(cset->mg_src_cgrp, task, + cset->mg_dst_cset); } up_write(&css_set_rwsem); - /* nothing is sensitive to fork() after this point. */ + + /* migration is committed, all target tasks are now on dst_csets */ + tset.csets = &tset.dst_csets; + + /* nothing is sensitive to fork() after this point */ /* * step 4: do subsystem attach callbacks. @@ -1865,30 +1873,27 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, if (css->ss->attach) css->ss->attach(css, &tset); - /* - * step 5: success! 
and cleanup - */ ret = 0; -out_put_css_set_refs: - if (ret) { - for (i = 0; i < group_size; i++) { - tc = flex_array_get(group, i); - if (!tc->cset) - break; - put_css_set(tc->cset, false); - } - } + goto out_release_tset; + out_cancel_attach: - if (ret) { - for_each_css(css, i, cgrp) { - if (css == failed_css) - break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, &tset); - } + for_each_css(css, i, cgrp) { + if (css == failed_css) + break; + if (css->ss->cancel_attach) + css->ss->cancel_attach(css, &tset); } -out_free_group_list: - flex_array_free(group); +out_release_tset: + down_write(&css_set_rwsem); + list_splice_init(&tset.dst_csets, &tset.src_csets); + list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { + list_splice_init(&cset->mg_tasks, &cset->tasks); + cset->mg_dst_cset = NULL; + cset->mg_src_cgrp = NULL; + list_del_init(&cset->mg_node); + put_css_set_locked(cset, false); + } + up_write(&css_set_rwsem); return ret; } @@ -3895,6 +3900,8 @@ int __init cgroup_init_early(void) atomic_set(&init_css_set.refcount, 1); INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); + INIT_LIST_HEAD(&init_css_set.mg_tasks); + INIT_LIST_HEAD(&init_css_set.mg_node); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; init_cgroup_root(&cgroup_dummy_root); -- cgit v1.3-14-g43fede From ceb6a081f6f52d17ec9e46e271cc26a1eb8a7573 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:02 -0500 Subject: cgroup: separate out cset_group_from_root() from task_cgroup_from_root() This will be used by the planned migration path update. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5def4a800425..23e3a8c74bd4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -758,25 +758,15 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) cgroup_free_root(root); } -/* - * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex and css_set_rwsem held. - */ -static struct cgroup *task_cgroup_from_root(struct task_struct *task, +/* look up cgroup associated with given css_set on the specified hierarchy */ +static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroupfs_root *root) { - struct css_set *cset; struct cgroup *res = NULL; lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_rwsem); - /* - * No need to lock the task - since we hold cgroup_mutex the - * task can't change groups, so the only thing that can happen - * is that it exits and its css is set back to init_css_set. - */ - cset = task_css_set(task); if (cset == &init_css_set) { res = &root->top_cgroup; } else { @@ -796,6 +786,21 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, return res; } +/* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex and css_set_rwsem held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroupfs_root *root) +{ + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + return cset_cgroup_from_root(task_css_set(task), root); +} + /* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. 
-- cgit v1.3-14-g43fede From 1958d2d53dadbb1c9aaf0b37741f13a60098b243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: split process / task migration into four steps Currently, process / task migration is a single operation which may fail depending on memory pressure or the involved controllers' ->can_attach() callbacks. One problem with this approach is migration of multiple targets. It's impossible to tell whether a given target will be successfully migrated beforehand and cgroup core can't keep track of enough states to roll back after intermediate failure. This is already an issue with cgroup_transfer_tasks(). Also, we're gonna need multiple target migration for unified hierarchy. This patch splits migration into four stages - cgroup_migrate_add_src(), cgroup_migrate_prepare_dst(), cgroup_migrate() and cgroup_migrate_finish(), where cgroup_migrate_prepare_dst() performs all the operations which may fail due to allocation failure without actually migrating the target. The four separate stages mean that, disregarding ->can_attach() failures, the success or failure of multi target migration can be determined before performing any actual migration. If preparations of all targets succeed, the whole thing will succeed. If not, the whole operation can fail without any side-effect. Since the previous patch to use css_set->mg_tasks to keep track of migration targets, the only thing which may need memory allocation during migration is the target css_sets. cgroup_migrate_prepare() pins all source and target css_sets and link them up. Note that this can be performed without holding threadgroup_lock even if the target is a process. As long as cgroup_mutex is held, no new css_set can be put into play. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 240 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 182 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3a1cb265afd6..4829a577c1b9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -350,6 +350,7 @@ struct css_set { * List of csets participating in the on-going migration either as * source or destination. Protected by cgroup_mutex. */ + struct list_head mg_preload_node; struct list_head mg_node; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 23e3a8c74bd4..a93f6f1ebc69 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -644,6 +644,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->cgrp_links); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->mg_preload_node); INIT_LIST_HEAD(&cset->mg_node); INIT_HLIST_NODE(&cset->hlist); @@ -1755,16 +1756,137 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, } /** - * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup - * @cgrp: the cgroup to attach to - * @leader: the task or the leader of the threadgroup to be attached - * @threadgroup: attach the whole threadgroup? + * cgroup_migrate_finish - cleanup after attach + * @preloaded_csets: list of preloaded css_sets * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of @tsk or each thread in the threadgroup individually in turn. + * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See + * those functions for details. 
*/ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, - bool threadgroup) +static void cgroup_migrate_finish(struct list_head *preloaded_csets) +{ + struct css_set *cset, *tmp_cset; + + lockdep_assert_held(&cgroup_mutex); + + down_write(&css_set_rwsem); + list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_preload_node); + put_css_set_locked(cset, false); + } + up_write(&css_set_rwsem); +} + +/** + * cgroup_migrate_add_src - add a migration source css_set + * @src_cset: the source css_set to add + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded css_sets + * + * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin + * @src_cset and add it to @preloaded_csets, which should later be cleaned + * up by cgroup_migrate_finish(). + * + * This function may be called without holding threadgroup_lock even if the + * target is a process. Threads may be created and destroyed but as long + * as cgroup_mutex is not dropped, no new css_set can be put into play and + * the preloaded css_sets are guaranteed to cover all migrations. + */ +static void cgroup_migrate_add_src(struct css_set *src_cset, + struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + struct cgroup *src_cgrp; + + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + + /* nothing to do if this cset already belongs to the cgroup */ + if (src_cgrp == dst_cgrp) + return; + + if (!list_empty(&src_cset->mg_preload_node)) + return; + + WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(!list_empty(&src_cset->mg_tasks)); + WARN_ON(!list_empty(&src_cset->mg_node)); + + src_cset->mg_src_cgrp = src_cgrp; + get_css_set(src_cset); + list_add(&src_cset->mg_preload_node, preloaded_csets); +} + +/** + * cgroup_migrate_prepare_dst - prepare destination css_sets for migration + * @dst_cgrp: the destination cgroup + * @preloaded_csets: list of preloaded source css_sets + * + * Tasks are about to be moved to @dst_cgrp and all the source css_sets + * have been preloaded to @preloaded_csets. This function looks up and + * pins all destination css_sets, links each to its source, and put them on + * @preloaded_csets. + * + * This function must be called after cgroup_migrate_add_src() has been + * called on each migration source css_set. After migration is performed + * using cgroup_migrate(), cgroup_migrate_finish() must be called on + * @preloaded_csets. 
+ */ +static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, + struct list_head *preloaded_csets) +{ + LIST_HEAD(csets); + struct css_set *src_cset; + + lockdep_assert_held(&cgroup_mutex); + + /* look up the dst cset for each src cset and link it to src */ + list_for_each_entry(src_cset, preloaded_csets, mg_preload_node) { + struct css_set *dst_cset; + + dst_cset = find_css_set(src_cset, dst_cgrp); + if (!dst_cset) + goto err; + + WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); + src_cset->mg_dst_cset = dst_cset; + + if (list_empty(&dst_cset->mg_preload_node)) + list_add(&dst_cset->mg_preload_node, &csets); + else + put_css_set(dst_cset, false); + } + + list_splice(&csets, preloaded_csets); + return 0; +err: + cgroup_migrate_finish(&csets); + return -ENOMEM; +} + +/** + * cgroup_migrate - migrate a process or task to a cgroup + * @cgrp: the destination cgroup + * @leader: the leader of the process or the task to migrate + * @threadgroup: whether @leader points to the whole process or a single task + * + * Migrate a process or task denoted by @leader to @cgrp. If migrating a + * process, the caller must be holding threadgroup_lock of @leader. The + * caller is also responsible for invoking cgroup_migrate_add_src() and + * cgroup_migrate_prepare_dst() on the targets before invoking this + * function and following up with cgroup_migrate_finish(). + * + * As long as a controller's ->can_attach() doesn't fail, this function is + * guaranteed to succeed. This means that, excluding ->can_attach() + * failure, when migrating multiple targets, the success or failure can be + * decided for all targets by invoking group_migrate_prepare_dst() before + * actually starting migrating. + */ +static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, + bool threadgroup) { struct cgroup_taskset tset = { .src_csets = LIST_HEAD_INIT(tset.src_csets), @@ -1785,29 +1907,17 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, rcu_read_lock(); task = leader; do { - struct cgroup *src_cgrp; - /* @task either already exited or can't exit until the end */ if (task->flags & PF_EXITING) goto next; cset = task_css_set(task); - src_cgrp = task_cgroup_from_root(task, cgrp->root); - - /* nothing to do if this task is already in the cgroup */ - if (src_cgrp == cgrp) + if (!cset->mg_src_cgrp) goto next; - if (!cset->mg_src_cgrp) { - WARN_ON(!list_empty(&cset->mg_tasks)); - WARN_ON(!list_empty(&cset->mg_node)); - - cset->mg_src_cgrp = src_cgrp; - list_add(&cset->mg_node, &tset.src_csets); - get_css_set(cset); - } - list_move(&task->cg_list, &cset->mg_tasks); + list_move(&cset->mg_node, &tset.src_csets); + list_move(&cset->mg_dst_cset->mg_node, &tset.dst_csets); next: if (!threadgroup) break; @@ -1819,9 +1929,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, if (list_empty(&tset.src_csets)) return 0; - /* - * step 1: check that we can legitimately attach to the cgroup. - */ + /* check that we can legitimately attach to the cgroup */ for_each_css(css, i, cgrp) { if (css->ss->can_attach) { ret = css->ss->can_attach(css, &tset); @@ -1833,30 +1941,9 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, } /* - * step 2: make sure css_sets exist for all threads to be migrated. - * we use find_css_set, which allocates a new one if necessary. 
- */ - list_for_each_entry(cset, &tset.src_csets, mg_node) { - struct css_set *dst_cset; - - dst_cset = find_css_set(cset, cgrp); - if (!dst_cset) { - ret = -ENOMEM; - goto out_release_tset; - } - - if (list_empty(&dst_cset->mg_node)) - list_add(&dst_cset->mg_node, &tset.dst_csets); - else - put_css_set(dst_cset, false); - - cset->mg_dst_cset = dst_cset; - } - - /* - * step 3: now that we're guaranteed success wrt the css_sets, - * proceed to move all tasks to the new cgroup. There are no - * failure cases after here, so this is the commit point. + * Now that we're guaranteed success, proceed to move all tasks to + * the new cgroup. There are no failure cases after here, so this + * is the commit point. */ down_write(&css_set_rwsem); list_for_each_entry(cset, &tset.src_csets, mg_node) { @@ -1866,14 +1953,13 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, } up_write(&css_set_rwsem); - /* migration is committed, all target tasks are now on dst_csets */ - tset.csets = &tset.dst_csets; - - /* nothing is sensitive to fork() after this point */ - /* - * step 4: do subsystem attach callbacks. + * Migration is committed, all target tasks are now on dst_csets. + * Nothing is sensitive to fork() after this point. Notify + * controllers that migration is complete. */ + tset.csets = &tset.dst_csets; + for_each_css(css, i, cgrp) if (css->ss->attach) css->ss->attach(css, &tset); @@ -1893,15 +1979,50 @@ out_release_tset: list_splice_init(&tset.dst_csets, &tset.src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { list_splice_init(&cset->mg_tasks, &cset->tasks); - cset->mg_dst_cset = NULL; - cset->mg_src_cgrp = NULL; list_del_init(&cset->mg_node); - put_css_set_locked(cset, false); } up_write(&css_set_rwsem); return ret; } +/** + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup + * @dst_cgrp: the cgroup to attach to + * @leader: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? + * + * Call holding cgroup_mutex and the group_rwsem of the leader. Will take + * task_lock of @tsk or each thread in the threadgroup individually in turn. + */ +static int cgroup_attach_task(struct cgroup *dst_cgrp, + struct task_struct *leader, bool threadgroup) +{ + LIST_HEAD(preloaded_csets); + struct task_struct *task; + int ret; + + /* look up all src csets */ + down_read(&css_set_rwsem); + rcu_read_lock(); + task = leader; + do { + cgroup_migrate_add_src(task_css_set(task), dst_cgrp, + &preloaded_csets); + if (!threadgroup) + break; + } while_each_thread(leader, task); + rcu_read_unlock(); + up_read(&css_set_rwsem); + + /* prepare dst csets and commit */ + ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + if (!ret) + ret = cgroup_migrate(dst_cgrp, leader, threadgroup); + + cgroup_migrate_finish(&preloaded_csets); + return ret; +} + /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. 
Will lock @@ -3906,6 +4027,7 @@ int __init cgroup_init_early(void) INIT_LIST_HEAD(&init_css_set.cgrp_links); INIT_LIST_HEAD(&init_css_set.tasks); INIT_LIST_HEAD(&init_css_set.mg_tasks); + INIT_LIST_HEAD(&init_css_set.mg_preload_node); INIT_LIST_HEAD(&init_css_set.mg_node); INIT_HLIST_NODE(&init_css_set.hlist); css_set_count = 1; -- cgit v1.3-14-g43fede From eaf797abc53b0ab3f0a02d4ef873a565fcce6daa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: update how a newly forked task gets associated with css_set When a new process is forked, cgroup_fork() associates it with the css_set of its parent but doesn't link it into it. After the new process is linked to tasklist, cgroup_post_fork() does the linking. This is problematic for cgroup_transfer_tasks() as there's no way to tell whether there are tasks which are pointing to a css_set but not linked yet. It is impossible to implement an operation which transfer all tasks of a cgroup to another and the current cgroup_transfer_tasks() can easily be tricked into leaving a newly forked process behind if it gets called between cgroup_fork() and cgroup_post_fork(). Let's make association with a css_set and linking atomic by moving it to cgroup_post_fork(). cgroup_fork() sets child->cgroups to init_css_set as a placeholder and cgroup_post_fork() is updated to perform both the association with the parent's cgroup and linking there. This means that a newly created task will point to init_css_set without holding a ref to it much like what it does on the exit path. Empty cg_list is used to indicate that the task isn't holding a ref to the associated css_set. This fixes an actual bug with cgroup_transfer_tasks(); however, I'm not marking it for -stable. The whole thing is broken in multiple other ways which require invasive updates to fix and I don't think it's worthwhile to bother with backporting this particular one. Fortunately, the only user is cpuset and these bugs don't crash the machine. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 86 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 55 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a93f6f1ebc69..fa0567f4eedd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1342,8 +1342,12 @@ static void cgroup_enable_task_cg_lists(void) * racing against cgroup_exit(). 
*/ spin_lock_irq(&p->sighand->siglock); - if (!(p->flags & PF_EXITING)) - list_add(&p->cg_list, &task_css_set(p)->tasks); + if (!(p->flags & PF_EXITING)) { + struct css_set *cset = task_css_set(p); + + list_add(&p->cg_list, &cset->tasks); + get_css_set(cset); + } spin_unlock_irq(&p->sighand->siglock); task_unlock(p); @@ -1911,6 +1915,10 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, if (task->flags & PF_EXITING) goto next; + /* leave @task alone if post_fork() hasn't linked it yet */ + if (list_empty(&task->cg_list)) + goto next; + cset = task_css_set(task); if (!cset->mg_src_cgrp) goto next; @@ -2815,6 +2823,12 @@ void css_task_iter_end(struct css_task_iter *it) * cgroup_trasnsfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside + * + * Locking rules between cgroup_post_fork() and the migration path + * guarantee that, if a task is forking while being migrated, the new child + * is guaranteed to be either visible in the source cgroup after the + * parent's migration is complete or put into the target cgroup. No task + * can slip out of migration through forking. */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { @@ -4243,27 +4257,16 @@ static const struct file_operations proc_cgroupstats_operations = { }; /** - * cgroup_fork - attach newly forked task to its parents cgroup. + * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. * - * Description: A task inherits its parent's cgroup at fork(). - * - * A pointer to the shared css_set was automatically copied in - * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU or cgroup_mutex, so - * might no longer be a valid cgroup pointer. cgroup_attach_task() might - * have already changed current->cgroups, allowing the previously - * referenced cgroup group to be removed and freed. - * - * At the point that cgroup_fork() is called, 'current' is the parent - * task, and the passed argument 'child' points to the child task. + * A task is associated with the init_css_set until cgroup_post_fork() + * attaches it to the parent's css_set. Empty cg_list indicates that + * @child isn't holding reference to its css_set. */ void cgroup_fork(struct task_struct *child) { - task_lock(current); - get_css_set(task_css_set(current)); - child->cgroups = current->cgroups; - task_unlock(current); + RCU_INIT_POINTER(child->cgroups, &init_css_set); INIT_LIST_HEAD(&child->cg_list); } @@ -4283,21 +4286,38 @@ void cgroup_post_fork(struct task_struct *child) int i; /* - * use_task_css_set_links is set to 1 before we walk the tasklist - * under the tasklist_lock and we read it here after we added the child - * to the tasklist under the tasklist_lock as well. If the child wasn't - * yet in the tasklist when we walked through it from - * cgroup_enable_task_cg_lists(), then use_task_css_set_links value - * should be visible now due to the paired locking and barriers implied - * by LOCK/UNLOCK: it is written before the tasklist_lock unlock - * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock - * lock on fork. + * This may race against cgroup_enable_task_cg_links(). 
As that + * function sets use_task_css_set_links before grabbing + * tasklist_lock and we just went through tasklist_lock to add + * @child, it's guaranteed that either we see the set + * use_task_css_set_links or cgroup_enable_task_cg_lists() sees + * @child during its iteration. + * + * If we won the race, @child is associated with %current's + * css_set. Grabbing css_set_rwsem guarantees both that the + * association is stable, and, on completion of the parent's + * migration, @child is visible in the source of migration or + * already in the destination cgroup. This guarantee is necessary + * when implementing operations which need to migrate all tasks of + * a cgroup to another. + * + * Note that if we lose to cgroup_enable_task_cg_links(), @child + * will remain in init_css_set. This is safe because all tasks are + * in the init_css_set before cg_links is enabled and there's no + * operation which transfers all tasks out of init_css_set. */ if (use_task_css_set_links) { + struct css_set *cset; + down_write(&css_set_rwsem); + cset = task_css_set_check(current, + lockdep_is_held(&css_set_rwsem)); task_lock(child); - if (list_empty(&child->cg_list)) - list_add(&child->cg_list, &task_css_set(child)->tasks); + if (list_empty(&child->cg_list)) { + rcu_assign_pointer(child->cgroups, cset); + list_add(&child->cg_list, &cset->tasks); + get_css_set(cset); + } task_unlock(child); up_write(&css_set_rwsem); } @@ -4353,6 +4373,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { struct cgroup_subsys *ss; struct css_set *cset; + bool put_cset = false; int i; /* @@ -4361,8 +4382,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) */ if (!list_empty(&tsk->cg_list)) { down_write(&css_set_rwsem); - if (!list_empty(&tsk->cg_list)) + if (!list_empty(&tsk->cg_list)) { list_del_init(&tsk->cg_list); + put_cset = true; + } up_write(&css_set_rwsem); } @@ -4384,7 +4407,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - put_css_set(cset, true); + if (put_cset) + put_css_set(cset, true); } static void check_for_release(struct cgroup *cgrp) -- cgit v1.3-14-g43fede From 0e1d768f1b1873272ec4e8dc1482bb5281855017 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: drop task_lock() protection around task->cgroups For optimization, task_lock() is additionally used to protect task->cgroups. The optimization is pretty dubious as either css_set_rwsem is grabbed anyway or PF_EXITING already protects task->cgroups. It adds only overhead and confusion at this point. Let's drop task_[un]lock() and update comments accordingly. 
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 ++-- kernel/cgroup.c | 97 +++++++++++++------------------------------------- 2 files changed, 28 insertions(+), 75 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4829a577c1b9..acbb9a4cb6e9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -658,10 +658,12 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) */ #ifdef CONFIG_PROVE_RCU extern struct mutex cgroup_mutex; +extern struct rw_semaphore css_set_rwsem; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ - lockdep_is_held(&(task)->alloc_lock) || \ - lockdep_is_held(&cgroup_mutex) || (__c)) + lockdep_is_held(&cgroup_mutex) || \ + lockdep_is_held(&css_set_rwsem) || \ + ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fa0567f4eedd..f783af900208 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -80,12 +80,21 @@ static DEFINE_MUTEX(cgroup_tree_mutex); /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. + * + * css_set_rwsem protects task->cgroups pointer, the list of css_set + * objects, and the chain of tasks off each css_set. + * + * These locks are exported if CONFIG_PROVE_RCU so that accessors in + * cgroup.h can use them for lockdep annotations. */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ +DECLARE_RWSEM(css_set_rwsem); +EXPORT_SYMBOL_GPL(cgroup_mutex); +EXPORT_SYMBOL_GPL(css_set_rwsem); #else static DEFINE_MUTEX(cgroup_mutex); +static DECLARE_RWSEM(css_set_rwsem); #endif /* @@ -338,12 +347,6 @@ struct cgrp_cset_link { static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; - -/* - * css_set_rwsem protects the list of css_set objects, and the chain of - * tasks off each css_set. - */ -static DECLARE_RWSEM(css_set_rwsem); static int css_set_count; /* @@ -803,10 +806,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } /* - * There is one global cgroup mutex. We also require taking - * task_lock() when dereferencing a task's cgroup subsys pointers. - * See "The task_lock() exception", at the end of this comment. - * * A task must hold cgroup_mutex to modify cgroups. * * Any task can increment and decrement the count field without lock. @@ -836,18 +835,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * always has either children cgroups and/or using tasks. So we don't * need a special hack to ensure that top_cgroup cannot be deleted. * - * The task_lock() exception - * - * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one task's cgroup pointer with - * another. It does so using cgroup_mutex, however there are - * several performance critical places that need to reference - * task->cgroup without the expense of grabbing a system global - * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task's cgroup pointer we use - * task_lock(), which acts on a spinlock (task->alloc_lock) already in - * the task_struct routinely used for such matters. - * * P.S. One more locking exception. 
RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() */ @@ -1329,8 +1316,6 @@ static void cgroup_enable_task_cg_lists(void) */ read_lock(&tasklist_lock); do_each_thread(g, p) { - task_lock(p); - WARN_ON_ONCE(!list_empty(&p->cg_list) || task_css_set(p) != &init_css_set); @@ -1349,8 +1334,6 @@ static void cgroup_enable_task_cg_lists(void) get_css_set(cset); } spin_unlock_irq(&p->sighand->siglock); - - task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); out_unlock: @@ -1743,11 +1726,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, old_cset = task_css_set(tsk); get_css_set(new_cset); - - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, new_cset); - task_unlock(tsk); - list_move(&tsk->cg_list, &new_cset->mg_tasks); /* @@ -1999,8 +1978,7 @@ out_release_tset: * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * - * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of @tsk or each thread in the threadgroup individually in turn. + * Call holding cgroup_mutex and threadgroup_lock of @leader. */ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup) @@ -2034,7 +2012,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. Will lock - * cgroup_mutex and threadgroup; may take task_lock of task. + * cgroup_mutex and threadgroup. */ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { @@ -4155,12 +4133,6 @@ core_initcall(cgroup_wq_init); * proc_cgroup_show() * - Print task's cgroup paths into seq_file, one line for each hierarchy * - Used for /proc//cgroup. - * - No need to task_lock(tsk) on this tsk->cgroup reference, as it - * doesn't really matter if tsk->cgroup changes after we read it, - * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it - * anyway. No need to check that tsk->cgroup != NULL, thanks to - * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks - * cgroup to top_cgroup. */ /* TODO: Use a proper seq_file iterator */ @@ -4310,15 +4282,12 @@ void cgroup_post_fork(struct task_struct *child) struct css_set *cset; down_write(&css_set_rwsem); - cset = task_css_set_check(current, - lockdep_is_held(&css_set_rwsem)); - task_lock(child); + cset = task_css_set(current); if (list_empty(&child->cg_list)) { rcu_assign_pointer(child->cgroups, cset); list_add(&child->cg_list, &cset->tasks); get_css_set(cset); } - task_unlock(child); up_write(&css_set_rwsem); } @@ -4347,27 +4316,13 @@ void cgroup_post_fork(struct task_struct *child) * use notify_on_release cgroups where very high task exit scaling * is required on large systems. * - * the_top_cgroup_hack: - * - * Set the exiting tasks cgroup to the root cgroup (top_cgroup). - * - * We call cgroup_exit() while the task is still competent to - * handle notify_on_release(), then leave the task attached to the - * root cgroup in each hierarchy for the remainder of its exit. - * - * To do this properly, we would increment the reference count on - * top_cgroup, and near the very end of the kernel/exit.c do_exit() - * code we would add a second cgroup function call, to drop that - * reference. This would just create an unnecessary hot spot on - * the top_cgroup reference count, to no avail. 
- * - * Normally, holding a reference to a cgroup without bumping its - * count is unsafe. The cgroup could go away, or someone could - * attach us to a different cgroup, decrementing the count on - * the first cgroup that we never incremented. But in this case, - * top_cgroup isn't going away, and either task has PF_EXITING set, - * which wards off any cgroup_attach_task() attempts, or task is a failed - * fork, never visible to cgroup_attach_task. + * We set the exiting tasks cgroup to the root cgroup (top_cgroup). We + * call cgroup_exit() while the task is still competent to handle + * notify_on_release(), then leave the task attached to the root cgroup in + * each hierarchy for the remainder of its exit. No need to bother with + * init_css_set refcnting. init_css_set never goes away and we can't race + * with migration path - either PF_EXITING is visible to migration path or + * @tsk never got on the tasklist. */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { @@ -4377,20 +4332,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) int i; /* - * Unlink from the css_set task list if necessary. Optimistically - * check cg_list before taking css_set_rwsem. + * Unlink @tsk from its css_set. As migration path can't race + * with us, we can check cg_list without grabbing css_set_rwsem. */ if (!list_empty(&tsk->cg_list)) { down_write(&css_set_rwsem); - if (!list_empty(&tsk->cg_list)) { - list_del_init(&tsk->cg_list); - put_cset = true; - } + list_del_init(&tsk->cg_list); up_write(&css_set_rwsem); + put_cset = true; } /* Reassign the task to the init_css_set. */ - task_lock(tsk); cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); @@ -4405,7 +4357,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } } } - task_unlock(tsk); if (put_cset) put_css_set(cset, true); -- cgit v1.3-14-g43fede From 952aaa125428fae883670a2c2e40ea8044ca1eaa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 25 Feb 2014 10:04:03 -0500 Subject: cgroup: update cgroup_transfer_tasks() to either succeed or fail cgroup_transfer_tasks() can currently fail in the middle due to memory allocation failure. When that happens, the function just aborts and returns an error code, and there's no way to tell how many tasks actually got migrated at the point of failure and/or to revert the partial migration. Update it to use cgroup_migrate{_add_src|prepare_dst|migrate|finish}() so that the function either succeeds or fails as a whole as long as ->can_attach() doesn't fail. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f783af900208..306ad0ed19ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2810,10 +2810,28 @@ void css_task_iter_end(struct css_task_iter *it) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { + LIST_HEAD(preloaded_csets); + struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; - int ret = 0; + int ret; + + mutex_lock(&cgroup_mutex); + + /* all tasks in @from are being moved, all csets are source */ + down_read(&css_set_rwsem); + list_for_each_entry(link, &from->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, to, &preloaded_csets); + up_read(&css_set_rwsem); + ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + if (ret) + goto out_err; + + /* + * Migrate tasks one-by-one until @from is empty.
This fails iff + * ->can_attach() fails. + */ do { css_task_iter_start(&from->dummy_css, &it); task = css_task_iter_next(&it); @@ -2822,13 +2840,13 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - mutex_lock(&cgroup_mutex); - ret = cgroup_attach_task(to, task, false); - mutex_unlock(&cgroup_mutex); + ret = cgroup_migrate(to, task, false); put_task_struct(task); } } while (task && !ret); - +out_err: + cgroup_migrate_finish(&preloaded_csets); + mutex_unlock(&cgroup_mutex); return ret; } -- cgit v1.3-14-g43fede From a60bed296ac67b9e2765646dec8e36e3b4d7c395 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 16:07:59 -0500 Subject: cgroup_freezer: document freezer_fork() subtleties cgroup_subsys->fork() callback is special in that it's called outside the usual cgroup locking and may race with on-going migration. freezer_fork() currently doesn't consider such race condition; however, it is still correct thanks to the fact that freeze_task() may be called spuriously. This is quite subtle. Let's explain what's going on and add test to detect racing and losing to task migration and skip freeze_task() in such cases for documentation. This doesn't make any behavior difference meaningful to userland. Signed-off-by: Tejun Heo Cc: Li Zefan Cc: "Rafael J. Wysocki" --- kernel/cgroup_freezer.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 7201a637c405..2ea98b216bff 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -214,6 +214,16 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, } } +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; @@ -222,14 +232,26 @@ static void freezer_fork(struct task_struct *task) freezer = task_freezer(task); /* - * The root cgroup is non-freezable, so we can skip the - * following check. + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. */ if (!parent_freezer(freezer)) goto out; + /* + * Grab @freezer->lock and freeze @task after verifying @task still + * belongs to @freezer and it's freezing. The former is for the + * case where we have raced against task migration and lost and + * @task is already in a different cgroup which may not be frozen. + * This isn't strictly necessary as freeze_task() is allowed to be + * called spuriously but let's do it anyway for, if nothing else, + * documentation. + */ spin_lock_irq(&freezer->lock); - if (freezer->state & CGROUP_FREEZING) + if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) freeze_task(task); spin_unlock_irq(&freezer->lock); out: -- cgit v1.3-14-g43fede From fff421580f512fc044cc7421fdff31a7a6997350 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 14 Jan 2014 20:20:43 -0800 Subject: timers: Track total number of timers in list Currently, the tvec_base structure's ->active_timers field tracks only the non-deferrable timers, which means that even if ->active_timers is zero, there might well be deferrable timers in the list. This commit therefore adds an ->all_timers field to track all the timers, whether deferrable or not. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index accfd241b9e5..fdc43834f3af 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -81,6 +81,7 @@ struct tvec_base { unsigned long timer_jiffies; unsigned long next_timer; unsigned long active_timers; + unsigned long all_timers; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -392,6 +393,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) base->next_timer = timer->expires; base->active_timers++; } + base->all_timers++; } #ifdef CONFIG_TIMER_STATS @@ -671,6 +673,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) detach_timer(timer, true); if (!tbase_get_deferrable(timer->base)) base->active_timers--; + base->all_timers--; } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -685,6 +688,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } + base->all_timers--; return 1; } @@ -1559,6 +1563,7 @@ static int init_timers_cpu(int cpu) base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; base->active_timers = 0; + base->all_timers = 0; return 0; } -- cgit v1.3-14-g43fede From d550e81dc0ddc04f1b417c179c214103a28e0ee8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 05:57:10 -0800 Subject: timers: Reduce __run_timers() latency for empty list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. In this case, which is likely to be common for NO_HZ_FULL kernels, the kernel currently incurs a large latency for no good reason. This commit therefore short-circuits this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index fdc43834f3af..c8bc7091d8f3 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -338,6 +338,20 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); +/* + * If the list is empty, catch up ->timer_jiffies to the current time. + * The caller must hold the tvec_base lock. Returns true if the list + * was empty and therefore ->timer_jiffies was updated. 
+ */ +static bool catchup_timer_jiffies(struct tvec_base *base) +{ + if (!base->all_timers) { + base->timer_jiffies = jiffies; + return true; + } + return false; +} + static void __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { @@ -1150,6 +1164,10 @@ static inline void __run_timers(struct tvec_base *base) struct timer_list *timer; spin_lock_irq(&base->lock); + if (catchup_timer_jiffies(base)) { + spin_unlock_irq(&base->lock); + return; + } while (time_after_eq(jiffies, base->timer_jiffies)) { struct list_head work_list; struct list_head *head = &work_list; -- cgit v1.3-14-g43fede From 16d937f880312e3f47157d4d6d6ebf7e61523378 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 10:32:01 -0800 Subject: timers: Reduce future __run_timers() latency for newly emptied list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. Therefore, if we just emptied the timer wheel, for example, by deleting the last timer, we should mark the timer wheel as being up to date. This marking will reduce (and perhaps eliminate) the jiffy-stepping that a future __run_timers() call will need to do in response to some future timer posting or migration. This commit therefore catches ->timer_jiffies for this case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index c8bc7091d8f3..dfac34f7186f 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -688,6 +688,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) if (!tbase_get_deferrable(timer->base)) base->active_timers--; base->all_timers--; + (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -703,6 +704,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, base->next_timer = base->timer_jiffies; } base->all_timers--; + (void)catchup_timer_jiffies(base); return 1; } -- cgit v1.3-14-g43fede From 18d8cb64c9c074cbe2bd677ab10fff8283abdb62 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Dec 2013 10:41:50 -0800 Subject: timers: Reduce future __run_timers() latency for first add to empty list The __run_timers() function currently steps through the list one jiffy at a time in order to update the timer wheel. However, if the timer wheel is empty, no adjustment is needed other than updating ->timer_jiffies. Therefore, just before we add a timer to an empty timer wheel, we should mark the timer wheel as being up to date. This marking will reduce (and perhaps eliminate) the jiffy-stepping that a future __run_timers() call will need to do in response to some future timer posting or migration. This commit therefore updates ->timer_jiffies for this case. Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Reviewed-by: Oleg Nesterov Reviewed-by: Steven Rostedt Tested-by: Mike Galbraith --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index dfac34f7186f..0c638cf3d9d2 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -398,6 +398,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { + (void)catchup_timer_jiffies(base); __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer -- cgit v1.3-14-g43fede From aea369b959bef10d235cd0714789cd8b0fe170b8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Jan 2014 16:19:27 -0800 Subject: timers: Make internal_add_timer() update ->next_timer if ->active_timers == 0 The internal_add_timer() function updates base->next_timer only if timer->expires < base->next_timer. This is correct, but it also makes sense to do the same if we add the first non-deferrable timer. Signed-off-by: Oleg Nesterov Reviewed-by: Steven Rostedt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Peter Zijlstra Tested-by: Mike Galbraith --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 0c638cf3d9d2..c0d8898fed98 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -404,9 +404,9 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * Update base->active_timers and base->next_timer */ if (!tbase_get_deferrable(timer->base)) { - if (time_before(timer->expires, base->next_timer)) + if (!base->active_timers++ || + time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; - base->active_timers++; } base->all_timers++; } -- cgit v1.3-14-g43fede From d498d4b47fb3050f2f7840cc49251f87f04d1ca9 Mon Sep 17 00:00:00 2001 From: Vijaya Kumar K Date: Tue, 28 Jan 2014 16:50:20 +0530 Subject: KGDB: make kgdb_breakpoint() as noinline The function kgdb_breakpoint() sets up a break point at compile time by calling arch_kgdb_breakpoint(). Though this call is surrounded by a wmb() barrier, the compiler can still re-order the break point, because this scheduling barrier is not a code motion barrier in gcc. Making kgdb_breakpoint() noinline solves this problem of code reordering around the break point instruction, and also avoids the problem of it being called as an inline function from other places. More details about the discussion can be found here: http://comments.gmane.org/gmane.linux.ports.arm.kernel/269732 Signed-off-by: Vijaya Kumar K Acked-by: Will Deacon Acked-by: Jason Wessel Signed-off-by: Catalin Marinas --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 334b3980ffc1..99982a70ddad 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -1035,7 +1035,7 @@ int dbg_io_get_char(void) * otherwise as a quick means to stop program execution and "break" into * the debugger. */ -void kgdb_breakpoint(void) +noinline void kgdb_breakpoint(void) { atomic_inc(&kgdb_setting_breakpoint); wmb(); /* Sync point before breakpoint */ -- cgit v1.3-14-g43fede From 8857563b819b140aa8c9be920cfe44d5d3f808b7 Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Wed, 12 Feb 2014 13:02:22 -0800 Subject: notifier: Substitute rcu_access_pointer() for rcu_dereference_raw() (Trivial patch.) If the code is looking at the RCU-protected pointer itself, but not dereferencing it, the rcu_dereference() functions can be downgraded to rcu_access_pointer(). This commit makes this downgrade in __blocking_notifier_call_chain() which simply compares the RCU-protected pointer against NULL with no dereferencing. Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Linus Torvalds Reviewed-by: Josh Triplett --- kernel/notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index 2d5cc4ccff7f..db4c8b08a50c 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ - if (rcu_dereference_raw(nh->head)) { + if (rcu_access_pointer(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); -- cgit v1.3-14-g43fede From 7a754743185a4b05818e10058fa2fbe4e6969085 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 11 Feb 2014 16:10:12 -0500 Subject: rcu: Fix sparse warning for rcu_expedited from kernel/ksysfs.c This commit fixes the follwoing warning: kernel/ksysfs.c:143:5: warning: symbol 'rcu_expedited' was not declared. Should it be static? Signed-off-by: Paul Gortmaker [ paulmck: Moved the declaration to include/linux/rcupdate.h to avoid including the RCU-internal rcu.h file outside of RCU. ] Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 1 + kernel/ksysfs.c | 2 ++ kernel/rcu/rcu.h | 2 -- 3 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 32decf1a9c6c..f3706c6b2e21 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -46,6 +46,7 @@ #include #include +extern int rcu_expedited; /* for sysctl */ #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; /* for sysctl */ #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index d945a949760f..e660964086e2 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -19,6 +19,8 @@ #include #include +#include /* rcu_expedited */ + #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 1bd787fddcb2..af2e60a8425d 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -116,8 +116,6 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) } } -extern int rcu_expedited; - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_suppress; -- cgit v1.3-14-g43fede From 5cb5c6e18f822b19bd41a2c0f9930c82b3ec0bc9 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 19 Feb 2014 14:33:27 -0500 Subject: rcu: Ensure kernel/rcu/rcu.h can be sourced/used stand-alone The kbuild test bot uncovered an implicit dependence on the trace header being present before rcu.h in ia64 allmodconfig that looks like this: In file included from kernel/ksysfs.c:22:0: kernel/rcu/rcu.h: In function '__rcu_reclaim': kernel/rcu/rcu.h:107:3: error: implicit declaration of function 'trace_rcu_invoke_kfree_callback' [-Werror=implicit-function-declaration] kernel/rcu/rcu.h:112:3: error: implicit declaration of function 
'trace_rcu_invoke_callback' [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors Looking at other rcu.h users, we can find that they all were sourcing the trace header in advance of rcu.h itself, as seen in the context of this diff. There were also some inconsistencies as to whether it was or wasn't sourced based on the parent tracing Kconfig. Rather than "fix" it at each use site, and have inconsistent use based on whether "#ifdef CONFIG_RCU_TRACE" was used or not, lets just source the trace header just once, in the actual consumer of it, which is rcu.h itself. We include it unconditionally, as build testing shows us that is a hard requirement for some files. Reported-by: kbuild test robot Signed-off-by: Paul Gortmaker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 1 + kernel/rcu/srcu.c | 2 -- kernel/rcu/tiny.c | 4 ---- kernel/rcu/tree.c | 2 -- kernel/rcu/update.c | 1 - 5 files changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index af2e60a8425d..bfda2726ca45 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -23,6 +23,7 @@ #ifndef __LINUX_RCU_H #define __LINUX_RCU_H +#include #ifdef CONFIG_RCU_TRACE #define RCU_TRACE(stmt) stmt #else /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 2359779e1daa..c639556f3fa0 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -36,8 +36,6 @@ #include #include -#include - #include "rcu.h" /* diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 53b95bbf4abb..d9efcc13008c 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -37,10 +37,6 @@ #include #include -#ifdef CONFIG_RCU_TRACE -#include -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - #include "rcu.h" /* Forward declarations for tiny_plugin.h. */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 73c3cd2b87ac..c7ed5db2dd79 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -58,8 +58,6 @@ #include #include "tree.h" -#include - #include "rcu.h" MODULE_ALIAS("rcutree"); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fd0d5b5b8e7c..4c0a9b0af469 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -49,7 +49,6 @@ #include #define CREATE_TRACE_POINTS -#include #include "rcu.h" -- cgit v1.3-14-g43fede From 5edb93b89f6cc3089ee283656555e7a9ad36a8a0 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 4 Feb 2014 19:32:28 -0800 Subject: resource: Add resource_contains() We have two identical copies of resource_contains() already, and more places that could use it. This moves it to ioport.h where it can be shared. resource_contains(struct resource *r1, struct resource *r2) returns true iff r1 and r2 are the same type (most callers already checked this separately) and the r1 address range completely contains r2. In addition, the new resource_contains() checks that both r1 and r2 have addresses assigned to them. If a resource is IORESOURCE_UNSET, it doesn't have a valid address and can't contain or be contained by another resource. Some callers already check this or for res->start. No functional change. 
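Before the diff, a compilable userspace model of the helper's semantics may help. struct res, the flag values, and res_contains() are invented for the demo; only the shape of the checks mirrors the new resource_contains():

  #include <stdbool.h>
  #include <stdio.h>

  #define TYPE_BITS 0x00000f00UL  /* demo stand-in for IORESOURCE_TYPE_BITS */
  #define UNSET     0x20000000UL  /* demo stand-in for IORESOURCE_UNSET */

  struct res { unsigned long start, end, flags; };

  static unsigned long res_type(const struct res *r)
  {
          return r->flags & TYPE_BITS;
  }

  static bool res_contains(const struct res *r1, const struct res *r2)
  {
          if (res_type(r1) != res_type(r2))
                  return false;   /* different resource types */
          if ((r1->flags & UNSET) || (r2->flags & UNSET))
                  return false;   /* no valid address assigned */
          return r1->start <= r2->start && r1->end >= r2->end;
  }

  int main(void)
  {
          struct res window = { 0x1000, 0x1fff, 0x100 };
          struct res bar    = { 0x1200, 0x12ff, 0x100 };

          printf("%d\n", res_contains(&window, &bar));  /* 1: fully inside */
          bar.flags |= UNSET;
          printf("%d\n", res_contains(&window, &bar));  /* 0: unset resource */
          return 0;
  }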
Signed-off-by: Bjorn Helgaas --- drivers/pci/host-bridge.c | 8 -------- include/linux/ioport.h | 10 ++++++++++ kernel/resource.c | 8 ++------ 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index 06ace6248c61..47aaf22d814e 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -32,11 +32,6 @@ void pci_set_host_bridge_release(struct pci_host_bridge *bridge, bridge->release_data = release_data; } -static bool resource_contains(struct resource *res1, struct resource *res2) -{ - return res1->start <= res2->start && res1->end >= res2->end; -} - void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, struct resource *res) { @@ -45,9 +40,6 @@ void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, resource_size_t offset = 0; list_for_each_entry(window, &bridge->windows, list) { - if (resource_type(res) != resource_type(window->res)) - continue; - if (resource_contains(window->res, res)) { offset = window->offset; break; diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 89b7c24a36e9..9fcaac8bc4f6 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -169,6 +169,16 @@ static inline unsigned long resource_type(const struct resource *res) { return res->flags & IORESOURCE_TYPE_BITS; } +/* True iff r1 completely contains r2 */ +static inline bool resource_contains(struct resource *r1, struct resource *r2) +{ + if (resource_type(r1) != resource_type(r2)) + return false; + if (r1->flags & IORESOURCE_UNSET || r2->flags & IORESOURCE_UNSET) + return false; + return r1->start <= r2->start && r1->end >= r2->end; +} + /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) diff --git a/kernel/resource.c b/kernel/resource.c index 3f285dce9347..a8344dda7049 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -432,11 +432,6 @@ static void resource_clip(struct resource *res, resource_size_t min, res->end = max; } -static bool resource_contains(struct resource *res1, struct resource *res2) -{ - return res1->start <= res2->start && res1->end >= res2->end; -} - /* * Find empty slot in the resource tree with the given range and * alignment constraints @@ -471,10 +466,11 @@ static int __find_resource(struct resource *root, struct resource *old, arch_remove_reservations(&tmp); /* Check for overflow after ALIGN() */ - avail = *new; avail.start = ALIGN(tmp.start, constraint->align); avail.end = tmp.end; + avail.flags = new->flags & ~IORESOURCE_UNSET; if (avail.start >= tmp.start) { + alloc.flags = avail.flags; alloc.start = constraint->alignf(constraint->alignf_data, &avail, size, constraint->align); alloc.end = alloc.start + size - 1; -- cgit v1.3-14-g43fede From c685689fd24d310343ac33942e9a54a974ae9c43 Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 24 Feb 2014 11:29:50 +0800 Subject: genirq: Remove racy waitqueue_active check We hit one rare case below: T1 calls disable_irq() but hangs at synchronize_irq() forever; the corresponding irq thread is sleeping; and all CPUs are idle. After analysis, we found there is one possible scenario which causes T1 to wait there forever:

  CPU0                                  CPU1
  synchronize_irq()
   wait_event()
    spin_lock()
                                        atomic_dec_and_test(&threads_active)
    insert the __wait into queue
    spin_unlock()
                                        if (waitqueue_active)
    atomic_read(&threads_active)
                                        wake_up()

Here, after the __wait has been inserted into the queue on
CPU0, and before the test of whether the queue is empty on CPU1, there is no barrier; the update may not be visible to CPU1 immediately, even though CPU0 has updated the queue list. The same applies to CPU0's atomic_read() of threads_active. So we would need an smp_mb() before waitqueue_active(); that would work, but removing the waitqueue_active() check solves it as well, and it makes things simple and clear. Signed-off-by: Chuansheng Liu Cc: Xiaoming Wang Link: http://lkml.kernel.org/r/1393212590-32543-1-git-send-email-chuansheng.liu@intel.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 481a13c43b17..d3bf660cb57f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } -- cgit v1.3-14-g43fede From 791c9e0292671a3bfa95286bb5c08129d8605618 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 18 Feb 2014 17:56:51 -0600 Subject: sched: Fix double normalization of vruntime dequeue_entity() is called when p->on_rq and sets se->on_rq = 0 which appears to guarantee that the !se->on_rq condition is met. If the task has done set_current_state(TASK_INTERRUPTIBLE) without schedule() the second condition will be met and vruntime will be incorrectly adjusted twice. In certain cases this can result in the task's vruntime never increasing past the vruntime of other tasks on the CFS' run queue, starving them of CPU time. This patch changes switched_from_fair() to use !p->on_rq instead of !se->on_rq. I'm able to cause a task with a priority of 120 to starve all other tasks with the same priority on an ARM platform running 3.2.51-rt72 PREEMPT RT by writing one character at a time to a serial tty (16550 UART) in a tight loop. I'm also able to verify that making this change corrects the problem on that platform and kernel version. Signed-off-by: George McCollister Signed-off-by: Peter Zijlstra Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1392767811-28916-1-git-send-email-george.mccollister@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 78157099b167..9b4c4f320130 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7001,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus.
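Returning to the genirq fix above: the pattern it removes - peeking at the wait queue before deciding whether to wake - is a classic lost-wakeup trap. A minimal pthread model of the race-free shape the patch adopts, always waking after the final decrement (all names invented for the demo; this is not the kernel code):

  #include <pthread.h>
  #include <stdio.h>

  /* threads_active counts in-flight handlers; the synchronize side
   * sleeps until it drops to zero. */
  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
  static int threads_active = 1;

  static void *handler(void *arg)
  {
          pthread_mutex_lock(&lock);
          if (--threads_active == 0)
                  pthread_cond_broadcast(&done);  /* always signal; never test
                                                   * "is anyone waiting?" first */
          pthread_mutex_unlock(&lock);
          return arg;
  }

  int main(void)
  {
          pthread_t t;

          pthread_create(&t, NULL, handler, NULL);

          pthread_mutex_lock(&lock);
          while (threads_active)          /* recheck under the mutex, so
                                           * the wakeup cannot be missed */
                  pthread_cond_wait(&done, &lock);
          pthread_mutex_unlock(&lock);

          pthread_join(t, NULL);
          puts("all handlers finished");
          return 0;
  }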
-- cgit v1.3-14-g43fede From 3908ac13b550c93f97d8db136ff572be5495bc06 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 25 Feb 2014 19:52:23 +0400 Subject: sched/deadline: Cleanup RT leftovers from {inc/dec}_dl_migration In deadline class we do not have group scheduling. So, let's remove unnecessary X = X; equations. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393343543.4089.5.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 15cbc17fbf84..aecf93030e0b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -135,7 +135,6 @@ static void update_dl_migration(struct dl_rq *dl_rq) static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory++; @@ -146,7 +145,6 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { struct task_struct *p = dl_task_of(dl_se); - dl_rq = &rq_of_dl_rq(dl_rq)->dl; if (p->nr_cpus_allowed > 1) dl_rq->dl_nr_migratory--; -- cgit v1.3-14-g43fede From eec751ed41a0ae7e92a43c33a458d7bd1b941631 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 24 Feb 2014 11:47:12 +0100 Subject: sched/deadline: Switch CPU's presence test order Commit 82b9580 ("sched/deadline: Test for CPU's presence explicitly") changed how we check if a CPU returned by cpudeadline machinery is valid. But, we don't want to call cpu_present() if best_cpu is equal to -1. So, switch the order of tests inside WARN_ON(). Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: boris.ostrovsky@oracle.com Cc: konrad.wilk@oracle.com Cc: rostedt@goodmis.org Link: http://lkml.kernel.org/r/1393238832-9100-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cpudeadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b8838b56d1c..5b9bb42b2d47 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx) static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) { - WARN_ON(!cpu_present(idx) || idx == IDX_INVALID); + WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); if (dl_time_before(new_dl, cp->elements[idx].dl)) { cp->elements[idx].dl = new_dl; @@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, } out: - WARN_ON(!cpu_present(best_cpu) && best_cpu != -1); + WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); return best_cpu; } -- cgit v1.3-14-g43fede From faa5993736d9b44b508cab4f1f3a77d66641c6f4 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 21 Feb 2014 11:37:15 +0100 Subject: sched/deadline: Prevent rt_time growth to infinity Kirill Tkhai noted: Since deadline tasks share rt bandwidth, we must care about bandwidth timer set. Otherwise rt_time may grow up to infinity in update_curr_dl(), if there are no other available RT tasks on top level bandwidth. RT task were in fact throttled right after they got enqueued, and never executed again (rt_time never again went below rt_runtime). 
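The guard this patch ends up adding (sched_rt_bandwidth_account(), shown in the diff further down) can be paraphrased: charge DL execution to rt_time only while the replenishment timer is running or rt_time is still below rt_runtime. A toy userspace model of just that condition - the struct and names are invented for the illustration:

  #include <stdbool.h>
  #include <stdio.h>

  struct rt_bw { bool timer_active; long rt_time, rt_runtime; };

  static bool bandwidth_account(const struct rt_bw *b)
  {
          return b->timer_active || b->rt_time < b->rt_runtime;
  }

  static void charge_dl(struct rt_bw *b, long delta)
  {
          if (bandwidth_account(b))
                  b->rt_time += delta;    /* otherwise rt_time saturates at the
                                           * limit instead of growing forever */
  }

  int main(void)
  {
          struct rt_bw b = { .timer_active = false, .rt_time = 0, .rt_runtime = 95 };

          for (int i = 0; i < 1000; i++)
                  charge_dl(&b, 1);
          printf("rt_time capped at %ld (rt_runtime %ld)\n", b.rt_time, b.rt_runtime);
          return 0;
  }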
Peter then proposed to accrue DL execution on rt_time only when the rt timer is active, and proposed a patch (this patch is a slight modification of that) to implement that behavior. While this solves Kirill's problem, it has a drawback. Indeed, Kirill noted again: It looks like we may get into a situation where all CPU time is shared between RT and DL tasks:

  rt_runtime = n
  rt_period  = 2n

  |  RT working, DL sleeping  |  DL working, RT sleeping     |
  -----------------------------------------------------------
  | (1)  duration = n         | (2)  duration = n            | (repeat)
  |---------------------------|------------------------------|
  | (rt_bw timer is running)  | (rt_bw timer is not running) |

No time for fair tasks at all. And while this can happen during the first period, if the rq is always backlogged, RT tasks won't have the opportunity to execute anymore: rt_time reached rt_runtime during (1); suppose that after (2) RT is enqueued back - it gets throttled, since the rt timer didn't fire, and replenishment is from now on eaten up by DL tasks that accrue their execution on rt_time (while the rt timer is active - we have an RT task waiting for replenishment). FAIR tasks are not touched after this first period. Ok, this is not ideal, and the situation is even worse! What is described above (the nice case) practically never happens in reality, where your rt timer is not aligned to task periods, tasks are in general not periodic, etc. Long story short, you always risk overloading your system. This patch is based on Peter's idea, but exploits an additional fact: if you don't have RT tasks enqueued, it makes little sense to continue incrementing rt_time once you reached the upper limit (DL tasks have their own mechanism for throttling). This cures both problems:

- no matter how many DL instances in the past, you'll have an rt_time slightly above rt_runtime when an RT task is enqueued, and from that point on (after the first replenishment), the task will normally execute;

- you can still eat up all the bandwidth during the first period, but not after that; remember that DL execution will increment rt_time till the upper limit is reached.

The situation is still not perfect! But, we have a simple solution for now, that limits how much you can jeopardize your system, as we keep working towards the right answer: RT groups scheduled using deadline servers. Reported-by: Kirill Tkhai Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140225151515.617714e2f2cd6c558531ba61@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 ++++++-- kernel/sched/rt.c | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index aecf93030e0b..6e79b3faa4cd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -562,6 +562,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se) return 1; } +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + /* * Update the current task's runtime statistics (provided it is still * a -deadline task and has not been removed from the dl_rq). */ @@ -625,11 +627,13 @@ static void update_curr_dl(struct rq *rq) struct rt_rq *rt_rq = &rq->rt; raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; /* * We'll let actual RT tasks worry about the overflow here, we - * have our own CBS to keep us inline -- see above. + * have our own CBS to keep us inline; only account when RT + * bandwidth is relevant.
*/ + if (sched_rt_bandwidth_account(rt_rq)) + rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a2740b775b45..1999021042c7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) #endif /* CONFIG_RT_GROUP_SCHED */ +bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + return (hrtimer_active(&rt_b->rt_period_timer) || + rt_rq->rt_time < rt_b->rt_runtime); +} + #ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. -- cgit v1.3-14-g43fede From e3703f8cdfcf39c25c4338c3ad8e68891cca3731 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2014 12:06:12 +0100 Subject: perf: Fix hotplug splat Drew Richardson reported that he could make the kernel go *boom* when hotplugging while having perf events active. It turned out that when you have a group event, the code in __perf_event_exit_context() fails to remove the group siblings from the context. We then proceed with destroying and freeing the event, and when you re-plug the CPU and try and add another event to that CPU, things go *boom* because you've still got dead entries there. Reported-by: Drew Richardson Signed-off-by: Peter Zijlstra Cc: Will Deacon Cc: Link: http://lkml.kernel.org/n/tip-k6v5wundvusvcseqj1si0oz0@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd3..fa0b2d4ad83c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7856,14 +7856,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { struct perf_event_context *ctx = __info; - struct perf_event *event, *tmp; + struct perf_event *event; perf_pmu_rotate_stop(ctx->pmu); - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - __perf_remove_from_context(event); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) __perf_remove_from_context(event); + rcu_read_unlock(); } static void perf_event_exit_cpu_context(int cpu) @@ -7887,11 +7887,11 @@ static void perf_event_exit_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + perf_event_exit_cpu_context(cpu); + mutex_lock(&swhash->hlist_mutex); swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); - - perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } -- cgit v1.3-14-g43fede From f5f9739d7a0ccbdcf913a0b3604b134129d14f7e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Wed, 26 Feb 2014 11:19:33 +0000 Subject: sched: Put rq's sched_avg under CONFIG_FAIR_GROUP_SCHED The struct sched_avg of struct rq is only used in case group scheduling is enabled inside __update_tg_runnable_avg() to update per-cpu representation of a task group. I.e. that there is no need to maintain the runnable avg of a rq in the !CONFIG_FAIR_GROUP_SCHED case. This patch guards struct sched_avg of struct rq and update_rq_runnable_avg() with CONFIG_FAIR_GROUP_SCHED. There is an extra empty definition for update_rq_runnable_avg() necessary for the !CONFIG_FAIR_GROUP_SCHED && CONFIG_SMP case. 
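The config-guard idiom at work here - a real body when CONFIG_FAIR_GROUP_SCHED is set, an empty static inline otherwise, so call sites need no #ifdefs - can be sketched in a few lines. This is an illustrative model with invented names, not the scheduler code:

  #include <stdio.h>

  #define CONFIG_FAIR_GROUP_SCHED 1      /* flip to 0 to compile the stub */

  #if CONFIG_FAIR_GROUP_SCHED
  static inline void update_avg(long *avg, long sample)
  {
          *avg += (sample - *avg) / 8;   /* real work only in this variant */
  }
  #else
  static inline void update_avg(long *avg, long sample) { }  /* empty stub */
  #endif

  int main(void)
  {
          long avg = 0;

          update_avg(&avg, 80);
          printf("avg=%ld\n", avg);      /* 10 with the feature on, 0 without */
          return 0;
  }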
The function print_cfs_group_stats() which prints out struct sched_avg of struct rq is already guarded with CONFIG_FAIR_GROUP_SCHED. Reviewed-by: Ben Segall Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/530DCDC5.1060406@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 +++++++------ kernel/sched/sched.h | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a3a41c61a2c9..be4f7d9eaf03 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2374,12 +2374,19 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) se->avg.load_avg_contrib >>= NICE_0_SHIFT; } } + +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) +{ + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_tg_runnable_avg(&rq->avg, &rq->cfs); +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, int force_update) {} static inline void __update_tg_runnable_avg(struct sched_avg *sa, struct cfs_rq *cfs_rq) {} static inline void __update_group_entity_contrib(struct sched_entity *se) {} +static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void __update_task_entity_contrib(struct sched_entity *se) @@ -2478,12 +2485,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update) __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); } -static inline void update_rq_runnable_avg(struct rq *rq, int runnable) -{ - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); - __update_tg_runnable_avg(&rq->avg, &rq->cfs); -} - /* Add the load generated by se into cfs_rq's child load-average */ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d608125b36ef..046084ebb1fb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -541,6 +541,8 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + + struct sched_avg avg; #endif /* CONFIG_FAIR_GROUP_SCHED */ /* @@ -630,8 +632,6 @@ struct rq { #ifdef CONFIG_SMP struct llist_head wake_list; #endif - - struct sched_avg avg; }; static inline int cpu_of(struct rq *rq) -- cgit v1.3-14-g43fede From 06d50c65b1043b166d102accc081093f79d8f7e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2014 18:22:07 +0100 Subject: sched/idle: Remove stale old file Commit cf37b6b48428d ("sched/idle: Move cpu/idle.c to sched/idle.c") said to simply move a file; somehow it got mangled and created an old version of the file and forgot to remove the old file. Fix this fail; add the lost change and remove the now identical old file. 
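The lost change is the TIF_NEED_RESCHED propagation at the bottom of the idle loop; condensed from the sched/idle.c hunk below, the resulting tail of cpu_idle_loop() looks roughly like this (a sketch, not new code):

		/* fell out of the inner !need_resched() loop: the flag is set */
		preempt_set_need_resched();	/* fold TIF_NEED_RESCHED into
						   PREEMPT_NEED_RESCHED; polling
						   idle loops never received the
						   IPI that does this folding */
		tick_nohz_idle_exit();
		schedule_preempt_disabled();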
Signed-off-by: Peter Zijlstra Cc: rjw@rjwysocki.net Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: Daniel Lezcano Link: http://lkml.kernel.org/r/20140224172207.GC9987@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cpu/idle.c | 147 ---------------------------------------------------- kernel/sched/idle.c | 17 +++--- 2 files changed, 10 insertions(+), 154 deletions(-) delete mode 100644 kernel/cpu/idle.c (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c deleted file mode 100644 index b7976a127178..000000000000 --- a/kernel/cpu/idle.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Generic entry point for the idle threads - */ -#include -#include -#include -#include -#include -#include - -#include - -#include - -static int __read_mostly cpu_idle_force_poll; - -void cpu_idle_poll_ctrl(bool enable) -{ - if (enable) { - cpu_idle_force_poll++; - } else { - cpu_idle_force_poll--; - WARN_ON_ONCE(cpu_idle_force_poll < 0); - } -} - -#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP -static int __init cpu_idle_poll_setup(char *__unused) -{ - cpu_idle_force_poll = 1; - return 1; -} -__setup("nohlt", cpu_idle_poll_setup); - -static int __init cpu_idle_nopoll_setup(char *__unused) -{ - cpu_idle_force_poll = 0; - return 1; -} -__setup("hlt", cpu_idle_nopoll_setup); -#endif - -static inline int cpu_idle_poll(void) -{ - rcu_idle_enter(); - trace_cpu_idle_rcuidle(0, smp_processor_id()); - local_irq_enable(); - while (!tif_need_resched()) - cpu_relax(); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); - rcu_idle_exit(); - return 1; -} - -/* Weak implementations for optional arch specific functions */ -void __weak arch_cpu_idle_prepare(void) { } -void __weak arch_cpu_idle_enter(void) { } -void __weak arch_cpu_idle_exit(void) { } -void __weak arch_cpu_idle_dead(void) { } -void __weak arch_cpu_idle(void) -{ - cpu_idle_force_poll = 1; - local_irq_enable(); -} - -/* - * Generic idle loop implementation - */ -static void cpu_idle_loop(void) -{ - while (1) { - tick_nohz_idle_enter(); - - while (!need_resched()) { - check_pgt_cache(); - rmb(); - - if (cpu_is_offline(smp_processor_id())) - arch_cpu_idle_dead(); - - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { - cpu_idle_poll(); - } else { - if (!current_clr_polling_and_test()) { - stop_critical_timings(); - rcu_idle_enter(); - if (cpuidle_idle_call()) - arch_cpu_idle(); - if (WARN_ON_ONCE(irqs_disabled())) - local_irq_enable(); - rcu_idle_exit(); - start_critical_timings(); - } else { - local_irq_enable(); - } - __current_set_polling(); - } - arch_cpu_idle_exit(); - } - - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - schedule_preempt_disabled(); - } -} - -void cpu_startup_entry(enum cpuhp_state state) -{ - /* - * This #ifdef needs to die, but it's too late in the cycle to - * make this generic (arm and sh have never invoked the canary - * init for the non boot cpus!). 
Will be fixed in 3.11 - */ -#ifdef CONFIG_X86 - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. The boot CPU already has it initialized but no harm - * in doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); -#endif - __current_set_polling(); - arch_cpu_idle_prepare(); - cpu_idle_loop(); -} diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 14ca43430aee..b7976a127178 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -108,14 +108,17 @@ static void cpu_idle_loop(void) __current_set_polling(); } arch_cpu_idle_exit(); - /* - * We need to test and propagate the TIF_NEED_RESCHED - * bit here because we might not have send the - * reschedule IPI to idle tasks. - */ - if (tif_need_resched()) - set_preempt_need_resched(); } + + /* + * Since we fell out of the loop above, we know + * TIF_NEED_RESCHED must be set, propagate it into + * PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will + * not have had an IPI to fold the state for us. + */ + preempt_set_need_resched(); tick_nohz_idle_exit(); schedule_preempt_disabled(); } -- cgit v1.3-14-g43fede From 37e117c07b89194aae7062bc63bde1104c03db02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Feb 2014 12:25:08 +0100 Subject: sched: Guarantee task priority in pick_next_task() Michael spotted that the idle_balance() push down created a task priority problem. Previously, when we called idle_balance() before pick_next_task() it wasn't a problem when -- because of the rq->lock droppage -- an rt/dl task slipped in. Similarly for pre_schedule(), rt pre-schedule could have a dl task slip in. But by pulling it into the pick_next_task() loop, we'll not try a higher task priority again. Cure this by creating a re-start condition in pick_next_task(); and triggering this from pick_next_task_{rt,fair}(). It also fixes a live-lock where we get stuck in pick_next_task_fair() due to idle_balance() seeing !0 nr_running but there not actually being any fair tasks about. 
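The re-start condition amounts to the following contract between the core pick loop and the class methods (condensed from the sched/core.c hunk below; a sketch, not new code):

again:
	for_each_class(class) {
		p = class->pick_next_task(rq, prev);
		if (p) {
			if (unlikely(p == RETRY_TASK))
				goto again;	/* a higher-prio class gained
						   runnable tasks meanwhile */
			return p;
		}
	}

Because for_each_class() walks the classes from highest to lowest priority, restarting from the top guarantees that an rt/dl task which slipped in under the dropped rq->lock is picked first.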
Reported-by: Michael Wang Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()") Tested-by: Sasha Levin Signed-off-by: Peter Zijlstra Cc: Juri Lelli Cc: Steven Rostedt Link: http://lkml.kernel.org/r/20140224121218.GR15586@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ++++++++---- kernel/sched/fair.c | 13 ++++++++++++- kernel/sched/rt.c | 10 +++++++++- kernel/sched/sched.h | 5 +++++ 4 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8a73b8897bf..cde573d3f12e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2586,24 +2586,28 @@ static inline void schedule_debug(struct task_struct *prev) static inline struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev) { - const struct sched_class *class; + const struct sched_class *class = &fair_sched_class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(prev->sched_class == &fair_sched_class && + if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); - if (likely(p)) + if (likely(p && p != RETRY_TASK)) return p; } +again: for_each_class(class) { p = class->pick_next_task(rq, prev); - if (p) + if (p) { + if (unlikely(p == RETRY_TASK)) + goto again; return p; + } } BUG(); /* the idle class will always have a runnable task */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be4f7d9eaf03..16042b58a32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4686,6 +4686,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev) struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; struct task_struct *p; + int new_tasks; again: #ifdef CONFIG_FAIR_GROUP_SCHED @@ -4784,7 +4785,17 @@ simple: return p; idle: - if (idle_balance(rq)) /* drops rq->lock */ + /* + * Because idle_balance() releases (and re-acquires) rq->lock, it is + * possible for any higher priority task to appear. In that case we + * must re-start the pick_next_entity() loop. + */ + new_tasks = idle_balance(rq); + + if (rq->nr_running != rq->cfs.h_nr_running) + return RETRY_TASK; + + if (new_tasks) goto again; return NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4d4b386598aa..398b3f990823 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1360,8 +1360,16 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; - if (need_pull_rt_task(rq, prev)) + if (need_pull_rt_task(rq, prev)) { pull_rt_task(rq); + /* + * pull_rt_task() can drop (and re-acquire) rq->lock; this + * means a dl task can slip in, in which case we need to + * re-start task selection. + */ + if (unlikely(rq->dl.dl_nr_running)) + return RETRY_TASK; + } if (!rt_rq->rt_nr_running) return NULL; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 046084ebb1fb..1929deb3f29d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1091,6 +1091,8 @@ static const u32 prio_to_wmult[40] = { #define DEQUEUE_SLEEP 1 +#define RETRY_TASK ((void *)-1UL) + struct sched_class { const struct sched_class *next; @@ -1105,6 +1107,9 @@ struct sched_class { * It is the responsibility of the pick_next_task() method that will * return the next task to call put_prev_task() on the @prev task or * something equivalent. 
+ * + * May return RETRY_TASK when it finds a higher prio class has runnable + * tasks. */ struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev); -- cgit v1.3-14-g43fede From 2b3942e4bb20ef8ef26515bd958c2df83d9a6210 Mon Sep 17 00:00:00 2001 From: Dongsheng Yang Date: Mon, 24 Feb 2014 22:12:01 +0800 Subject: trace: Replace hardcoding of 19 with MAX_NICE Use MAX_NICE instead of the value 19 for ring_buffer_benchmark. Signed-off-by: Dongsheng Yang Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1393251121-25534-1-git-send-email-yangds.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer_benchmark.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a5457d577b98..0434ff1b808e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -40,8 +40,8 @@ static int write_iteration = 50; module_param(write_iteration, uint, 0644); MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); -static int producer_nice = 19; -static int consumer_nice = 19; +static int producer_nice = MAX_NICE; +static int consumer_nice = MAX_NICE; static int producer_fifo = -1; static int consumer_fifo = -1; @@ -308,7 +308,7 @@ static void ring_buffer_producer(void) /* Let the user know that the test is running at low priority */ if (producer_fifo < 0 && consumer_fifo < 0 && - producer_nice == 19 && consumer_nice == 19) + producer_nice == MAX_NICE && consumer_nice == MAX_NICE) trace_printk("WARNING!!! This test is running at lowest priority.\n"); trace_printk("Time: %lld (usecs)\n", time); -- cgit v1.3-14-g43fede From 9e3170411ed171a126f4dca1672012a33efe59e5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2014 17:44:18 +0000 Subject: perf: Fix prototype of find_pmu_context() For some reason find_pmu_context() is defined as returning void * rather than a __percpu struct perf_cpu_context *. As all the requisite types are defined in advance, there's no reason to keep it that way. This patch modifies the prototype of find_pmu_context to return a __percpu struct perf_cpu_context *. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra Reviewed-by: Dave Martin Acked-by: Will Deacon Link: http://lkml.kernel.org/r/1392054264-23570-2-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fa990061aa6c..425159882a6f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6313,7 +6313,7 @@ static int perf_event_idx_default(struct perf_event *event) * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. */ -static void *find_pmu_context(int ctxn) +static struct perf_cpu_context __percpu *find_pmu_context(int ctxn) { struct pmu *pmu; -- cgit v1.3-14-g43fede From fdded676c3ef680bf1abc415d307d7e69a6768d1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 10 Feb 2014 17:44:19 +0000 Subject: perf: Remove redundant PMU assignment Currently perf_branch_stack_sched_in iterates over the set of pmus, checks that each pmu has a flush_branch_stack callback, then overwrites the pmu before calling the callback. This is either redundant or broken.
In systems with a single hw pmu, pmu == cpuctx->ctx.pmu, and thus the assignment is redundant. In systems with multiple hw pmus (i.e. multiple pmus with task_ctx_nr == perf_hw_context) the pmus share the same perf_cpu_context. Thus the assignment can cause one of the pmus to flush its branch stack repeatedly rather than causing each of the pmus to flush their branch stacks. Worse still, if only some pmus have the callback, the assignment can result in a branch to NULL. This patch removes the redundant assignment. Signed-off-by: Mark Rutland Acked-by: Will Deacon Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1392054264-23570-3-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 425159882a6f..823a53d72d6a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2582,8 +2582,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev, if (cpuctx->ctx.nr_branch_stack > 0 && pmu->flush_branch_stack) { - pmu = cpuctx->ctx.pmu; - perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(pmu); -- cgit v1.3-14-g43fede From 4a2345937c17722bd2979f662ae909846b4a052a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 24 Feb 2014 12:43:31 +0100 Subject: perf: Optimize group_sched_in() Use the ctx pmu instead of the event pmu. When a group leader is a software event but the group contains hardware events, the entire group is on the hardware PMU. Using the hardware PMU for the transaction makes most sense since that's the most expensive one to program (and software PMUs generally don't have TXN support anyway). Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-sctoo9t2f3nn2c9g568928q3@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 823a53d72d6a..661951ab8ae7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1733,7 +1733,7 @@ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - struct pmu *pmu = group_event->pmu; + struct pmu *pmu = ctx->pmu; u64 now = ctx->time; bool simulate = false; -- cgit v1.3-14-g43fede From 64be38ab03e9b238a1299857fef8b3707c0ed045 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:10:12 +0530 Subject: genirq: Include missing header file in irqdomain.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file include/linux/of_irq.h in kernel/irq/irqdomain.c because it contains the prototype declaration of a function defined in kernel/irq/irqdomain.c.
This eliminates the following warning in kernel/irq/irqdomain.c: kernel/irq/irqdomain.c:468:14: warning: no previous prototype for ‘irq_create_of_mapping’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Cc: Benjamin Herrenschmidt Link: http://lkml.kernel.org/r/eb89aebea7ff1a46122918ac389ebecf8248be9a.1393493276.git.rashika.kheria@gmail.com Signed-off-by: Thomas Gleixner --- kernel/irq/irqdomain.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf68bb36fe58..f14033700c25 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.3-14-g43fede From 4729583006772b9530404bc1bb7c3aa4a10ffd4d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:03 +0800 Subject: cpuset: fix a locking issue in cpuset_migrate_mm() I can trigger a lockdep warning: # mount -t cgroup -o cpuset xxx /cgroup # mkdir /cgroup/cpuset # mkdir /cgroup/tmp # echo 0 > /cgroup/tmp/cpuset.cpus # echo 0 > /cgroup/tmp/cpuset.mems # echo 1 > /cgroup/tmp/cpuset.memory_migrate # echo $$ > /cgroup/tmp/tasks # echo 1 > /cgroup/tmp/cpuset.mems =============================== [ INFO: suspicious RCU usage. ] 3.14.0-rc1-0.1-default+ #32 Not tainted ------------------------------- include/linux/cgroup.h:682 suspicious rcu_dereference_check() usage! ... [] dump_stack+0x72/0x86 [] lockdep_rcu_suspicious+0x101/0x140 [] cpuset_migrate_mm+0xb1/0xe0 ... We used to hold cgroup_mutex when calling cpuset_migrate_mm(), but now we hold cpuset_mutex, which causes task_css() to complain. This is not a false-positive but a real issue. Holding cpuset_mutex won't prevent a task from migrating to another cpuset, and it won't prevent the original task->cgroup from being destroyed during this change. Fixes: 5d21cc2db040 (cpuset: replace cgroup_mutex locking with cpuset internal locking) Cc: # 3.9+ Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f1..dba9e4aef69a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * Temporarilly set tasks mems_allowed to target nodes of migration, * so that the migration code can allocate pages on these nodes. * - * Call holding cpuset_mutex, so current's cpuset won't change - * during this call, as manage_mutex holds off any cpuset_attach() - * calls. Therefore we don't need to take task_lock around the - * call to guarantee_online_mems(), as we know no one is changing - * our task's cpuset. - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &tsk->mems_allowed); + rcu_read_unlock(); } /* -- cgit v1.3-14-g43fede From 99afb0fd5f05aac467ffa85c36778fec4396209b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:36 +0800 Subject: cpuset: fix a race condition in __cpuset_node_allowed_softwall() It's not safe to access a task's cpuset after releasing task_lock().
Holding callback_mutex won't help. Cc: Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index dba9e4aef69a..e6b1b66afe52 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2482,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) task_lock(current); cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); task_unlock(current); - allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); return allowed; } -- cgit v1.3-14-g43fede From 864f32a52b53341c54a25953afb5b66ed79a7f76 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:50:19 +0530 Subject: kernel: Mark function as static in kernel/seccomp.c Mark function as static in kernel/seccomp.c because it is not used outside this file. This eliminates the following warning in kernel/seccomp.c: kernel/seccomp.c:296:6: warning: no previous prototype for ‘seccomp_attach_user_filter’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: Kees Cook Acked-by: Will Drewry Signed-off-by: James Morris --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..0e004a70f63a 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -293,7 +293,7 @@ fail: * * Returns 0 on success and non-zero otherwise. */ -long seccomp_attach_user_filter(char __user *user_filter) +static long seccomp_attach_user_filter(char __user *user_filter) { struct sock_fprog fprog; long ret = -EFAULT; -- cgit v1.3-14-g43fede From 48095d991d85687569ac025b18a6c7ae1632c9f7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 3 Feb 2014 17:25:33 -0800 Subject: audit: Use struct net not pid_t to remember the network namespace to reply in In struct audit_netlink_list and audit_reply add a reference to the network namespace of the caller and remove the userspace pid of the caller. This cleanly remembers the caller's network namespace, and removes a huge class of races and nasty failure modes that can occur when attempting to re-look up the caller's network namespace from a pid_t (including the caller's network namespace changing, pid wraparound, and the pid simply not being present). Signed-off-by: "Eric W.
Biederman" --- kernel/audit.c | 10 ++++++---- kernel/audit.h | 2 +- kernel/auditfilter.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 34c5a2310fbf..1e5756f16f6f 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -182,7 +182,7 @@ struct audit_buffer { struct audit_reply { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff *skb; }; @@ -500,7 +500,7 @@ int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; - struct net *net = get_net_ns_by_pid(dest->pid); + struct net *net = dest->net; struct audit_net *aunet = net_generic(net, audit_net_id); /* wait for parent to finish and send an ACK */ @@ -510,6 +510,7 @@ int audit_send_list(void *_dest) while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(aunet->nlsk, skb, dest->portid, 0); + put_net(net); kfree(dest); return 0; @@ -543,7 +544,7 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct net *net = get_net_ns_by_pid(reply->pid); + struct net *net = reply->net; struct audit_net *aunet = net_generic(net, audit_net_id); mutex_lock(&audit_cmd_mutex); @@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg) /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); + put_net(net); kfree(reply); return 0; } @@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; + reply->net = get_net(current->nsproxy->net_ns); reply->portid = portid; - reply->pid = task_pid_vnr(current); reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); diff --git a/kernel/audit.h b/kernel/audit.h index 57cc64d67718..8df132214606 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -247,7 +247,7 @@ extern void audit_panic(const char *message); struct audit_netlink_list { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff_head q; }; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 14a78cca384e..a5e3d73d73e4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1083,8 +1084,8 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; + dest->net = get_net(current->nsproxy->net_ns); dest->portid = portid; - dest->pid = task_pid_vnr(current); skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); -- cgit v1.3-14-g43fede From 04b73469750050290cb0a773e7ecf2358d65f6d5 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:13:53 +0530 Subject: PM / sleep: Move prototype declaration to header file kernel/power/power.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move prototype declaration of function to header file kernel/power/power.h because it is used by more than one file. This eliminates the following warning in kernel/power/snapshot.c: kernel/power/snapshot.c:1588:16: warning: no previous prototype for ‘swsusp_save’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Acked-by: Pavel Machek Signed-off-by: Rafael J. 
Wysocki --- kernel/power/power.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 7d4b7ffb3c1d..1ca753106557 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -49,6 +49,8 @@ static inline char *check_image_kernel(struct swsusp_info *info) */ #define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) +asmlinkage int swsusp_save(void); + /* kernel/power/hibernate.c */ extern bool freezer_test_done; -- cgit v1.3-14-g43fede From 6c5be2916565e8c710e9f0f7b43cf65a3ba39dd9 Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:15:54 +0530 Subject: PM / wakeup: Include appropriate header file in kernel/power/wakelock.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file kernel/power/power.h in kernel/power/wakelock.c because it has the prototype declarations of functions defined in kernel/power/wakelock.c. This eliminates the following warnings in kernel/power/wakelock.c: kernel/power/wakelock.c:34:9: warning: no previous prototype for ‘pm_show_wakelocks’ [-Wmissing-prototypes] kernel/power/wakelock.c:184:5: warning: no previous prototype for ‘pm_wake_lock’ [-Wmissing-prototypes] kernel/power/wakelock.c:232:5: warning: no previous prototype for ‘pm_wake_unlock’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: Rafael J. Wysocki --- kernel/power/wakelock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index 8f50de394d22..019069c84ff6 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -18,6 +18,8 @@ #include #include +#include "power.h" + static DEFINE_MUTEX(wakelocks_lock); struct wakelock { -- cgit v1.3-14-g43fede From 421a5fa1a6cfc037a21220b638d4def6da7cbabe Mon Sep 17 00:00:00 2001 From: Sebastian Capella Date: Fri, 14 Feb 2014 14:52:56 -0800 Subject: PM / hibernate: use name_to_dev_t to parse resume Use the name_to_dev_t call to parse the device name echoed to /sys/power/resume. This imitates the method used in hibernate.c in software_resume, and allows the resume partition to be specified using other equivalent device formats as well. By allowing /sys/power/resume to accept the same syntax as the resume=device parameter, we can parse the resume=device in the init script and use the resume device directly from the kernel command line. Signed-off-by: Sebastian Capella Acked-by: Pavel Machek Signed-off-by: Rafael J.
Wysocki --- kernel/power/hibernate.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 37170d4dd9a6..f4f2073711d3 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -973,16 +973,20 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - unsigned int maj, min; dev_t res; - int ret = -EINVAL; + int len = n; + char *name; - if (sscanf(buf, "%u:%u", &maj, &min) != 2) - goto out; + if (len && buf[len-1] == '\n') + len--; + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; - res = MKDEV(maj,min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto out; + res = name_to_dev_t(name); + kfree(name); + if (!res) + return -EINVAL; lock_system_sleep(); swsusp_resume_device = res; @@ -990,9 +994,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, printk(KERN_INFO "PM: Starting manual resume from disk\n"); noresume = 0; software_resume(); - ret = n; - out: - return ret; + return n; } power_attr(resume); -- cgit v1.3-14-g43fede From 6f285b19d09f72e801525f5eea1bdad22e559bf0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2014 19:44:55 -0800 Subject: audit: Send replies in the proper network namespace. In perverse cases of file descriptor passing, the current network namespace of a process and the network namespace of a socket used by that process may differ. Therefore use the network namespace of the appropriate socket to ensure replies always go to the appropriate socket. Signed-off-by: "Eric W. Biederman" --- include/linux/audit.h | 3 ++- kernel/audit.c | 21 ++++++++++----------- kernel/auditfilter.c | 7 +++++-- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index aa865a9a4c4f..ec1464df4c60 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -43,6 +43,7 @@ struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; +struct sk_buff; struct audit_krule { int vers_ops; @@ -463,7 +464,7 @@ extern int audit_filter_user(int type); extern int audit_filter_type(int type); extern int audit_rule_change(int type, __u32 portid, int seq, void *data, size_t datasz); -extern int audit_list_rules_send(__u32 portid, int seq); +extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern u32 audit_enabled; #else /* CONFIG_AUDIT */ diff --git a/kernel/audit.c b/kernel/audit.c index 1e5756f16f6f..32086bff5564 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the port id. * No failure notifications.
*/ -static void audit_send_reply(__u32 portid, int seq, int type, int done, +static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), @@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; - reply->net = get_net(current->nsproxy->net_ns); + reply->net = get_net(net); reply->portid = portid; reply->skb = skb; @@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); return 0; } @@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog = skb_queue_len(&audit_skb_queue); s.version = AUDIT_VERSION_LATEST; s.backlog_wait_time = audit_backlog_wait_time; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { @@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: - err = audit_list_rules_send(NETLINK_CB(skb).portid, seq); + err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); @@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); + audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, + sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -985,8 +985,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); - audit_send_reply(NETLINK_CB(skb).portid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a5e3d73d73e4..e8d1c7c515d7 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1069,8 +1070,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, * @portid: target portid for netlink audit messages * @seq: netlink audit message sequence (serial) number */ -int audit_list_rules_send(__u32 portid, int seq) +int audit_list_rules_send(struct sk_buff *request_skb, int seq) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; int err = 0; @@ -1084,7 +1087,7 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(current->nsproxy->net_ns); + dest->net = get_net(net); dest->portid = portid; skb_queue_head_init(&dest->q); -- cgit v1.3-14-g43fede From 64e8d20bd39b81994d9cd60e7b42f8ec8652f5af Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Thu, 27 Feb 2014 17:25:54 +0530 Subject: 
kernel: Include appropriate header file in time/timekeeping_debug.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Include appropriate header file kernel/time/timekeeping_internal.h in kernel/time/timekeeping_debug.c because it has the prototype declaration of a function defined in kernel/time/timekeeping_debug.c. This eliminates the following warning in kernel/time/timekeeping_debug.c: kernel/time/timekeeping_debug.c:68:6: warning: no previous prototype for ‘tk_debug_account_sleep_time’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Reviewed-by: Josh Triplett Signed-off-by: John Stultz --- kernel/time/timekeeping_debug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index 802433a4f5eb..4d54f97558df 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -21,6 +21,8 @@ #include #include +#include "timekeeping_internal.h" + static unsigned int sleep_time_bin[32] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) -- cgit v1.3-14-g43fede From 03b8c7b623c80af264c4c8d6111e5c6289933666 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 2 Mar 2014 13:09:47 +0100 Subject: futex: Allow architectures to skip futex_atomic_cmpxchg_inatomic() test If an architecture has futex_atomic_cmpxchg_inatomic() implemented and there is no runtime check necessary, allow skipping the test within futex_init(). This allows getting rid of some code which would always give the same result, and also allows the compiler to optimize a couple of if statements away. Signed-off-by: Heiko Carstens Cc: Finn Thain Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20140302120947.GA3641@osiris Signed-off-by: Thomas Gleixner --- arch/s390/Kconfig | 1 + include/linux/futex.h | 4 ++++ init/Kconfig | 7 +++++++ kernel/futex.c | 37 ++++++++++++++++++++------------- 4 files changed, 36 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 65a07750f4f9..bb74b21f007a 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -117,6 +117,7 @@ config S390 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST + select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 diff --git a/include/linux/futex.h b/include/linux/futex.h index b0d95cac826e..6435f46d6e13 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -55,7 +55,11 @@ union futex_key { #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); extern void exit_pi_state_list(struct task_struct *curr); +#ifdef CONFIG_HAVE_FUTEX_CMPXCHG +#define futex_cmpxchg_enabled 1 +#else extern int futex_cmpxchg_enabled; +#endif #else static inline void exit_robust_list(struct task_struct *curr) { diff --git a/init/Kconfig b/init/Kconfig index 009a797dd242..d56cb03c1b49 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1387,6 +1387,13 @@ config FUTEX support for "fast userspace mutexes". The resulting kernel may not run glibc-based applications correctly. +config HAVE_FUTEX_CMPXCHG + bool + help + Architectures should select this if futex_atomic_cmpxchg_inatomic() + is implemented and always working. This removes a couple of runtime + checks.
+ config EPOLL bool "Enable eventpoll support" if EXPERT default y diff --git a/kernel/futex.c b/kernel/futex.c index 44a1261cb9ff..5d17e3a83f8c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -157,7 +157,9 @@ * enqueue. */ +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG int __read_mostly futex_cmpxchg_enabled; +#endif /* * Futex flags used to encode options to functions and preserve them across @@ -2843,9 +2845,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int __init futex_init(void) +static void __init futex_detect_cmpxchg(void) { +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG u32 curval; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non-functional ones will return + * -ENOSYS. + */ + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) + futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ unsigned int futex_shift; unsigned long i; @@ -2861,18 +2882,8 @@ static int __init futex_init(void) &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; - /* - * This will fail and we want it. Some arch implementations do - * runtime detection of the futex_atomic_cmpxchg_inatomic() - * functionality. We want to know that before we call in any - * of the complex code paths. Also we want to prevent - * registration of robust lists in that case. NULL is - * guaranteed to fault and we get -EFAULT on functional - * implementation, the non-functional ones will return - * -ENOSYS. - */ - if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) - futex_cmpxchg_enabled = 1; + + futex_detect_cmpxchg(); for (i = 0; i < futex_hashsize; i++) { plist_head_init(&futex_queues[i].chain); -- cgit v1.3-14-g43fede From b8dadcb58d542ecbf1d3dae5fefcd3fd8cb26539 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 3 Mar 2014 17:28:36 -0500 Subject: cpuset: use rcu_read_lock() to protect task_cs() We no longer use task_lock() to protect tsk->cgroups. 
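The conversion follows one pattern throughout (condensed from the cpuset.c hunks below; a sketch, not new code):

	rcu_read_lock();		/* was: task_lock(tsk) */
	cs = task_cs(tsk);		/* tsk->cgroups is RCU-protected */
	/* ... read cs->mems_allowed, fmeter, etc. ... */
	rcu_read_unlock();		/* was: task_unlock(tsk) */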
Reported-by: Fengguang Wu Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d8bec21d7a11..8d5324583aa4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2239,10 +2239,10 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) struct cpuset *cpus_cs; mutex_lock(&callback_mutex); - task_lock(tsk); + rcu_read_lock(); cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); guarantee_online_cpus(cpus_cs, pmask); - task_unlock(tsk); + rcu_read_unlock(); mutex_unlock(&callback_mutex); } @@ -2295,10 +2295,10 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) nodemask_t mask; mutex_lock(&callback_mutex); - task_lock(tsk); + rcu_read_lock(); mems_cs = effective_nodemask_cpuset(task_cs(tsk)); guarantee_online_mems(mems_cs, &mask); - task_unlock(tsk); + rcu_read_unlock(); mutex_unlock(&callback_mutex); return mask; @@ -2414,9 +2414,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) /* Not hardwall and node outside mems_allowed: scan up cpusets */ mutex_lock(&callback_mutex); - task_lock(current); + rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); - task_unlock(current); + rcu_read_unlock(); allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); @@ -2543,24 +2543,26 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, * @task: pointer to task_struct of some task. * * Description: Prints @task's name, cpuset name, and cached copy of its - * mems_allowed to the kernel log. Must hold task_lock(task) to allow - * dereferencing task_cs(task). + * mems_allowed to the kernel log. */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { /* Statically allocated to prevent using excess stack. */ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); - struct cgroup *cgrp = task_cs(tsk)->css.cgroup; + struct cgroup *cgrp; spin_lock(&cpuset_buffer_lock); + rcu_read_lock(); + cgrp = task_cs(tsk)->css.cgroup; nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=", tsk->comm); pr_cont_cgroup_name(cgrp); pr_cont(" mems_allowed=%s\n", cpuset_nodelist); + rcu_read_unlock(); spin_unlock(&cpuset_buffer_lock); } @@ -2592,9 +2594,9 @@ int cpuset_memory_pressure_enabled __read_mostly; void __cpuset_memory_pressure_bump(void) { - task_lock(current); + rcu_read_lock(); fmeter_markevent(&task_cs(current)->fmeter); - task_unlock(current); + rcu_read_unlock(); } #ifdef CONFIG_PROC_PID_CPUSET -- cgit v1.3-14-g43fede From 45ab2813d40d88fc575e753c38478de242d03f88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 13:37:38 -0500 Subject: tracing: Do not add event files for modules that fail tracepoints If a module fails to add its tracepoints due to module tainting, do not create the module event infrastructure in the debugfs directory. As the events will not work and worse yet, they will silently fail, making the user wonder why the events they enable do not display anything. Having a warning on module load and the events not visible to the users will make the cause of the problem much clearer. 
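The guard added to trace_module_add_events() is short (condensed from the trace_events.c hunk below; shown here for clarity, not new code):

	if (!mod->num_trace_events)
		return;

	/* Don't add infrastructure for mods without tracepoints */
	if (trace_module_has_bad_taint(mod)) {
		pr_err("%s: module has bad taint, not creating trace events\n",
		       mod->name);
		return;
	}

With the pr_err() on load and no event files in debugfs, a tainted module's missing events are no longer a silent failure.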
Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT" Acked-by: Mathieu Desnoyers Cc: stable@vger.kernel.org # 2.6.31+ Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 6 ++++++ kernel/trace/trace_events.c | 10 ++++++++++ kernel/tracepoint.c | 7 ++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index accc497f8d72..7159a0a933df 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -60,6 +60,12 @@ struct tp_module { unsigned int num_tracepoints; struct tracepoint * const *tracepoints_ptrs; }; +bool trace_module_has_bad_taint(struct module *mod); +#else +static inline bool trace_module_has_bad_taint(struct module *mod) +{ + return false; +} #endif /* CONFIG_MODULES */ struct tracepoint_iter { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4eccb5..f3989ceb5cd5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1777,6 +1777,16 @@ static void trace_module_add_events(struct module *mod) { struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c9..031cc5655a51 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +bool trace_module_has_bad_taint(struct module *mod) +{ + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); +} + static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod, *iter; @@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod) * module headers (for forced load), to make sure we don't cause a crash. * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.3-14-g43fede From 7dec935a3aa04412cba2cebe1524ae0d34a30c24 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 10:54:36 -0500 Subject: tracepoint: Do not waste memory on mods with no tracepoints No reason to allocate tp_module structures for modules that have no tracepoints. This just wastes memory. Fixes: b75ef8b44b1c "Tracepoint: Dissociate from module mutex" Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c9..0d4ef26574ff 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -636,6 +636,9 @@ static int tracepoint_module_coming(struct module *mod) struct tp_module *tp_mod, *iter; int ret = 0; + if (!mod->num_tracepoints) + return 0; + /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. 
@@ -679,6 +682,9 @@ static int tracepoint_module_going(struct module *mod) { struct tp_module *pos; + if (!mod->num_tracepoints) + return 0; + mutex_lock(&tracepoints_mutex); tracepoint_update_probe_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints); -- cgit v1.3-14-g43fede From c24a4a369419c360c323865b91198878275c1481 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Feb 2014 14:15:21 +0530 Subject: timer: Check failure of timer_cpu_notify() before calling init_timer_stats() timer_cpu_notify() should return NOTIFY_OK and nothing else. Anything else would trigger a BUG_ON(). The return value of this routine is already checked correctly, but the check is done after issuing a call to init_timer_stats(). The right order would be to check the error case first and then call init_timer_stats(). Let's do it. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: tj@kernel.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/c439f5b6bbc2047e1662f4d523350531425bcf9d.1393576981.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index a71bdfdb51e7..31824ef3eb96 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1681,9 +1681,9 @@ void __init init_timers(void) err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); - init_timer_stats(); - BUG_ON(err != NOTIFY_OK); + + init_timer_stats(); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.3-14-g43fede From 38edbb0b913d73713c23dcc742669f7e78b52aa7 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 28 Feb 2014 14:15:22 +0530 Subject: timer: Make sure TIMER_FLAG_MASK bits are free in allocated base Currently we are using the two lowest bits of base for internal purposes, and so they both should be zero in the allocated address. The code was doing the right thing before this patch came in: commit c5f66e99b (timer: Implement TIMER_IRQSAFE) Tejun probably forgot to update this piece of code, which checks whether the lowest 'n' bits are zero; it wasn't updated according to the new flag. Let's use TIMER_FLAG_MASK in the calculations here. [ tglx: Massaged changelog ] Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: tj@kernel.org Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/9144e10d7e854a0aa8a673332adec356d81a923c.1393576981.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 31824ef3eb96..949d74ea0ce4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1555,9 +1555,8 @@ static int init_timers_cpu(int cpu) if (!base) return -ENOMEM; - /* Make sure that tvec_base is 2 byte aligned */ - if (tbase_get_deferrable(base)) { - WARN_ON(1); + /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ + if (WARN_ON(base != tbase_get_base(base))) { kfree(base); return -ENOMEM; } -- cgit v1.3-14-g43fede From 792d0018a5fe31ef8ef9d07a7a02081d4abdf6b7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Feb 2014 21:40:14 +0000 Subject: genirq: Add a kstat helper to increment irq stats There is a common pattern all over the place: kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); This results in a call to core code anyway.
So provide a function which does the same thing in core. While at it, replace the butt ugly macro with an inline. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140223212737.422068876@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/kernel_stat.h | 12 +++++++----- kernel/irq/irqdesc.c | 5 +++++ 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 51c72be4a7c3..54ec7e0a7d72 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -54,11 +54,13 @@ extern unsigned long long nr_context_switches(void); #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_incr_irqs_this_cpu(irqno, DESC) \ -do { \ - __this_cpu_inc(*(DESC)->kstat_irqs); \ - __this_cpu_inc(kstat.irqs_sum); \ -} while (0) +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +{ + __this_cpu_inc(*desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} + +extern void kstat_incr_irq_this_cpu(unsigned int irq); static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 8ab8e9390297..a7174617616b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -489,6 +489,11 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +void kstat_incr_irq_this_cpu(unsigned int irq) +{ + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); +} + unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); -- cgit v1.3-14-g43fede From 8f945a3325bbe0dd651e2f496a53df9b06fc6d07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Feb 2014 21:40:23 +0000 Subject: genirq: Move kstat_incr_irqs_this_cpu() to core No more users outside the core code. Put it into the poison cabinet. That also gets rid of the linux/irq.h include in kernel_stat.h Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20140223212739.124207133@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/kernel_stat.h | 8 -------- kernel/irq/internals.h | 7 +++++++ 2 files changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 54ec7e0a7d72..c3fbda7690d6 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -51,15 +51,7 @@ DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); extern unsigned long long nr_context_switches(void); -#include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); - -static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) -{ - __this_cpu_inc(*desc->kstat_irqs); - __this_cpu_inc(kstat.irqs_sum); -} - extern void kstat_incr_irq_this_cpu(unsigned int irq); static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index d61ac29e32d0..17b671713d5f 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -6,6 +6,7 @@ * of this file for your non core code. 
*/ #include +#include #ifdef CONFIG_SPARSE_IRQ # define IRQ_BITMAP_BITS (NR_IRQS + 8196) @@ -180,3 +181,9 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) { return d->state_use_accessors & mask; } + +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) +{ + __this_cpu_inc(*desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} -- cgit v1.3-14-g43fede From af5040da01ef980670b3741b3e10733ee3e33566 Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Tue, 4 Mar 2014 23:13:10 +0900 Subject: blktrace: fix accounting of partially completed requests trace_block_rq_complete does not take into account that a request can be partially completed, so we can get the following incorrect output of blkparser: C R 232 + 240 [0] C R 240 + 232 [0] C R 248 + 224 [0] C R 256 + 216 [0] but should be: C R 232 + 8 [0] C R 240 + 8 [0] C R 248 + 8 [0] C R 256 + 8 [0] Also, the overall output summary statistics of completed requests and the final throughput will be incorrect. This patch takes the real completion size of the request into account and fixes the wrong completion accounting. Signed-off-by: Roman Pen CC: Steven Rostedt CC: Frederic Weisbecker CC: Ingo Molnar CC: linux-kernel@vger.kernel.org Cc: stable@kernel.org Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-mq.c | 2 +- include/trace/events/block.h | 33 ++++++++++++++++++++++++++++++--- kernel/trace/blktrace.c | 20 +++++++++++--------- 4 files changed, 43 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 853f92749202..99e20cca37e1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2354,7 +2354,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (!req->bio) return false; - trace_block_rq_complete(req->q, req); + trace_block_rq_complete(req->q, req, nr_bytes); /* * For fs requests, rq is just carrier of independent bio's diff --git a/block/blk-mq.c b/block/blk-mq.c index 6468a715a0e4..01d8735db8d3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -309,7 +309,7 @@ void blk_mq_end_io(struct request *rq, int error) struct bio *bio = rq->bio; unsigned int bytes = 0; - trace_block_rq_complete(rq->q, rq); + trace_block_rq_complete(rq->q, rq, blk_rq_bytes(rq)); while (bio) { struct bio *next = bio->bi_next; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index e76ae19a8d6f..e8a5eca1dbe5 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -132,6 +132,7 @@ DEFINE_EVENT(block_rq_with_error, block_rq_requeue, * block_rq_complete - block IO operation completed by device driver * @q: queue containing the block operation request * @rq: block operations request + * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion * of operation request has been completed by the device driver. If @@ -139,11 +140,37 @@ DEFINE_EVENT(block_rq_with_error, block_rq_requeue, * do for the request. If @rq->bio is non-NULL then there is * additional work required to complete the request.
*/ -DEFINE_EVENT(block_rq_with_error, block_rq_complete, +TRACE_EVENT(block_rq_complete, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request_queue *q, struct request *rq, + unsigned int nr_bytes), - TP_ARGS(q, rq) + TP_ARGS(q, rq, nr_bytes), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, RWBS_LEN ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_rq_pos(rq); + __entry->nr_sector = nr_bytes >> 9; + __entry->errors = rq->errors; + + blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->errors) ); DECLARE_EVENT_CLASS(block_rq, diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b418cb0d7242..4f3a3c03eadb 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -702,6 +702,7 @@ void blk_trace_shutdown(struct request_queue *q) * blk_add_trace_rq - Add a trace for a request oriented action * @q: queue the io is for * @rq: the source request + * @nr_bytes: number of completed bytes * @what: the action * * Description: @@ -709,7 +710,7 @@ void blk_trace_shutdown(struct request_queue *q) * **/ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, - u32 what) + unsigned int nr_bytes, u32 what) { struct blk_trace *bt = q->blk_trace; @@ -718,11 +719,11 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, + __blk_add_trace(bt, 0, nr_bytes, rq->cmd_flags, what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + __blk_add_trace(bt, blk_rq_pos(rq), nr_bytes, rq->cmd_flags, what, rq->errors, 0, NULL); } } @@ -730,33 +731,34 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, static void blk_add_trace_rq_abort(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ABORT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_INSERT); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_ISSUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); + blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); } static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, - struct request *rq) + struct request *rq, + unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); + blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); } /** -- cgit v1.3-14-g43fede From 62a6fa97684ed4c124564ea92500ecd513d60611 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 3 Mar 2014 16:11:13 +0100 Subject: kernel/compat: convert to COMPAT_SYSCALL_DEFINE 
Convert all compat system call functions where all parameter types have a size of four or less than four bytes, or are pointer types to COMPAT_SYSCALL_DEFINE. The implicit casts within COMPAT_SYSCALL_DEFINE will perform proper zero and sign extension to 64 bit of all parameters if needed. Signed-off-by: Heiko Carstens --- kernel/compat.c | 90 ++++++++++++++++++++++++++++----------------------------- kernel/ptrace.c | 4 +-- 2 files changed, 47 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 0a09e481b70b..2622011a44c9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -110,8 +110,8 @@ static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) return 0; } -asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) +COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) { if (tv) { struct timeval ktv; @@ -127,8 +127,8 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, return 0; } -asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) +COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, + struct timezone __user *, tz) { struct timespec kts; struct timezone ktz; @@ -236,8 +236,8 @@ static long compat_nanosleep_restart(struct restart_block *restart) return ret; } -asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) +COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { struct timespec tu, rmt; mm_segment_t oldfs; @@ -328,7 +328,7 @@ static compat_clock_t clock_t_to_compat_clock_t(clock_t x) return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); } -asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) +COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf) { if (tbuf) { struct tms tms; @@ -354,7 +354,7 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) * types that can be passed to put_user()/get_user(). 
*/ -asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) +COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set) { old_sigset_t s; long ret; @@ -424,8 +424,8 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how, #endif -asmlinkage long compat_sys_setrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; @@ -443,8 +443,8 @@ asmlinkage long compat_sys_setrlimit(unsigned int resource, #ifdef COMPAT_RLIM_OLD_INFINITY -asmlinkage long compat_sys_old_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; @@ -470,8 +470,8 @@ asmlinkage long compat_sys_old_getrlimit(unsigned int resource, #endif -asmlinkage long compat_sys_getrlimit(unsigned int resource, - struct compat_rlimit __user *rlim) +COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource, + struct compat_rlimit __user *, rlim) { struct rlimit r; int ret; @@ -596,9 +596,9 @@ static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, return compat_get_bitmap(k, user_mask_ptr, len * 8); } -asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, - unsigned int len, - compat_ulong_t __user *user_mask_ptr) +COMPAT_SYSCALL_DEFINE3(sched_setaffinity, compat_pid_t, pid, + unsigned int, len, + compat_ulong_t __user *, user_mask_ptr) { cpumask_var_t new_mask; int retval; @@ -616,8 +616,8 @@ out: return retval; } -asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, - compat_ulong_t __user *user_mask_ptr) +COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, + compat_ulong_t __user *, user_mask_ptr) { int ret; cpumask_var_t mask; @@ -662,9 +662,9 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst, return 0; } -long compat_sys_timer_create(clockid_t which_clock, - struct compat_sigevent __user *timer_event_spec, - timer_t __user *created_timer_id) +COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, + struct compat_sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) { struct sigevent __user *event = NULL; @@ -680,9 +680,9 @@ long compat_sys_timer_create(clockid_t which_clock, return sys_timer_create(which_clock, event, created_timer_id); } -long compat_sys_timer_settime(timer_t timer_id, int flags, - struct compat_itimerspec __user *new, - struct compat_itimerspec __user *old) +COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + struct compat_itimerspec __user *, new, + struct compat_itimerspec __user *, old) { long err; mm_segment_t oldfs; @@ -703,8 +703,8 @@ long compat_sys_timer_settime(timer_t timer_id, int flags, return err; } -long compat_sys_timer_gettime(timer_t timer_id, - struct compat_itimerspec __user *setting) +COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct compat_itimerspec __user *, setting) { long err; mm_segment_t oldfs; @@ -720,8 +720,8 @@ long compat_sys_timer_gettime(timer_t timer_id, return err; } -long compat_sys_clock_settime(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -737,8 +737,8 @@ long compat_sys_clock_settime(clockid_t which_clock, return err; } -long compat_sys_clock_gettime(clockid_t which_clock, - struct 
compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -754,8 +754,8 @@ long compat_sys_clock_gettime(clockid_t which_clock, return err; } -long compat_sys_clock_adjtime(clockid_t which_clock, - struct compat_timex __user *utp) +COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock, + struct compat_timex __user *, utp) { struct timex txc; mm_segment_t oldfs; @@ -777,8 +777,8 @@ long compat_sys_clock_adjtime(clockid_t which_clock, return ret; } -long compat_sys_clock_getres(clockid_t which_clock, - struct compat_timespec __user *tp) +COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock, + struct compat_timespec __user *, tp) { long err; mm_segment_t oldfs; @@ -818,9 +818,9 @@ static long compat_clock_nanosleep_restart(struct restart_block *restart) return err; } -long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, - struct compat_timespec __user *rqtp, - struct compat_timespec __user *rmtp) +COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags, + struct compat_timespec __user *, rqtp, + struct compat_timespec __user *, rmtp) { long err; mm_segment_t oldfs; @@ -1010,7 +1010,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese, /* compat_time_t is a 32 bit "long" and needs to get converted. */ -asmlinkage long compat_sys_time(compat_time_t __user * tloc) +COMPAT_SYSCALL_DEFINE1(time, compat_time_t __user *, tloc) { compat_time_t i; struct timeval tv; @@ -1026,7 +1026,7 @@ asmlinkage long compat_sys_time(compat_time_t __user * tloc) return i; } -asmlinkage long compat_sys_stime(compat_time_t __user *tptr) +COMPAT_SYSCALL_DEFINE1(stime, compat_time_t __user *, tptr) { struct timespec tv; int err; @@ -1046,7 +1046,7 @@ asmlinkage long compat_sys_stime(compat_time_t __user *tptr) #endif /* __ARCH_WANT_COMPAT_SYS_TIME */ -asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) +COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) { struct timex txc; int err, ret; @@ -1085,10 +1085,10 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); } -asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, - compat_ulong_t maxnode, - const compat_ulong_t __user *old_nodes, - const compat_ulong_t __user *new_nodes) +COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, + compat_ulong_t, maxnode, + const compat_ulong_t __user *, old_nodes, + const compat_ulong_t __user *, new_nodes) { unsigned long __user *old = NULL; unsigned long __user *new = NULL; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1f4bcb3cc21c..adf98622cb32 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1180,8 +1180,8 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, return ret; } -asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, - compat_long_t addr, compat_long_t data) +COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, + compat_long_t, addr, compat_long_t, data) { struct task_struct *child; long ret; -- cgit v1.3-14-g43fede From ca2c405ab90591dcb1bc3765467cbdf2b99a0f6a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 17:13:42 +0100 Subject: kexec/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types In order to allow the COMPAT_SYSCALL_DEFINE macro to generate code that performs proper zero and sign extension,
convert all 64 bit parameters to their corresponding 32 bit compat counterparts. Signed-off-by: Heiko Carstens --- include/linux/compat.h | 6 +++--- include/linux/kexec.h | 6 ------ kernel/kexec.c | 8 ++++---- 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 8e636211f334..ef4834c5bcab 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -641,10 +641,10 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, u32 val3); asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen); -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, +asmlinkage long compat_sys_kexec_load(compat_ulong_t entry, + compat_ulong_t nr_segments, struct compat_kexec_segment __user *, - unsigned long flags); + compat_ulong_t flags); asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, const struct compat_mq_attr __user *u_mqstat, struct compat_mq_attr __user *u_omqstat); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 6d4066cdb5b5..a75641930049 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -127,12 +127,6 @@ extern asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); -#ifdef CONFIG_COMPAT -extern asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags); -#endif extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); extern void crash_kexec(struct pt_regs *); diff --git a/kernel/kexec.c b/kernel/kexec.c index 60bafbed06ab..45601cf41bee 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1039,10 +1039,10 @@ void __weak crash_unmap_reserved_pages(void) {} #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, - struct compat_kexec_segment __user *segments, - unsigned long flags) +COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, + compat_ulong_t, nr_segments, + struct compat_kexec_segment __user *, segments, + compat_ulong_t, flags) { struct compat_kexec_segment in; struct kexec_segment out, __user *ksegments; -- cgit v1.3-14-g43fede From 2f2728f6de9837abe4b354443a45be578fbbf942 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 4 Mar 2014 17:18:23 +0100 Subject: mm/compat: convert to COMPAT_SYSCALL_DEFINE with changing parameter types In order to allow the COMPAT_SYSCALL_DEFINE macro to generate code that performs proper zero and sign extension, convert all 64 bit parameters to their corresponding 32 bit compat counterparts.
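To illustrate the extension problem these conversions address, here is a minimal sketch of the idea (an illustration only, not the actual COMPAT_SYSCALL_DEFINE expansion; the helper names are hypothetical):

#include <linux/compat.h>

/*
 * On a 64 bit kernel a 32 bit user space value arrives in a 64 bit
 * register whose upper half may contain stale bits. Narrowing the raw
 * value through the 32 bit compat type and widening it again yields a
 * properly zero extended (or, for signed compat types, sign extended)
 * value; this is the effect of the implicit casts that
 * COMPAT_SYSCALL_DEFINE applies to each declared parameter.
 */
static inline unsigned long compat_arg_zero_extend(unsigned long raw)
{
	compat_ulong_t arg = raw;	/* truncate to 32 bits */
	return arg;			/* zero extend back to 64 bits */
}

static inline long compat_arg_sign_extend(unsigned long raw)
{
	compat_long_t arg = raw;	/* truncate to 32 bits */
	return arg;			/* sign extend back to 64 bits */
}

This is also why declaring the parameters as compat_ulong_t rather than unsigned long matters: with the native 64 bit type the macro has no way of knowing that the upper half must be discarded.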
Signed-off-by: Heiko Carstens --- include/linux/compat.h | 10 +++++----- kernel/compat.c | 10 +++++----- mm/process_vm_access.c | 26 ++++++++++++-------------- 3 files changed, 22 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index ef4834c5bcab..7c765624b7ef 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -469,7 +469,7 @@ asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, asmlinkage long compat_sys_timerfd_gettime(int ufd, struct compat_itimerspec __user *otmr); -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, +asmlinkage long compat_sys_move_pages(pid_t pid, compat_ulong_t nr_pages, __u32 __user *pages, const int __user *nodes, int __user *status, @@ -674,12 +674,12 @@ extern void __user *compat_alloc_user_space(unsigned long len); asmlinkage ssize_t compat_sys_process_vm_readv(compat_pid_t pid, const struct compat_iovec __user *lvec, - unsigned long liovcnt, const struct compat_iovec __user *rvec, - unsigned long riovcnt, unsigned long flags); + compat_ulong_t liovcnt, const struct compat_iovec __user *rvec, + compat_ulong_t riovcnt, compat_ulong_t flags); asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid, const struct compat_iovec __user *lvec, - unsigned long liovcnt, const struct compat_iovec __user *rvec, - unsigned long riovcnt, unsigned long flags); + compat_ulong_t liovcnt, const struct compat_iovec __user *rvec, + compat_ulong_t riovcnt, compat_ulong_t flags); asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, compat_size_t count); diff --git a/kernel/compat.c b/kernel/compat.c index 2622011a44c9..488ff8c4cf48 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1065,11 +1065,11 @@ COMPAT_SYSCALL_DEFINE1(adjtimex, struct compat_timex __user *, utp) } #ifdef CONFIG_NUMA -asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - compat_uptr_t __user *pages32, - const int __user *nodes, - int __user *status, - int flags) +COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages, + compat_uptr_t __user *, pages32, + const int __user *, nodes, + int __user *, status, + int, flags) { const void __user * __user *pages; int i; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index fd26d0433509..3c5cf68566ec 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -456,25 +456,23 @@ free_iovecs: return rc; } -asmlinkage ssize_t -compat_sys_process_vm_readv(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } -asmlinkage ssize_t -compat_sys_process_vm_writev(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); -- cgit 
v1.3-14-g43fede From 1d6bae966e90134bcfd7807b8f9488d55198de91 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Aug 2012 19:16:14 -0400 Subject: tracing: Move raw output code from macro to standalone function The code for trace events to format the raw recorded event data into human readable format in the 'trace' file is repeated for every event in the system. When you have over 500 events, this can add up quite a bit. By making helper functions in the core kernel to do the work instead, we can shrink the size of the kernel down a bit. With a kernel configured with 502 events, the change in size was: text data bss dec hex filename 12991007 1913568 9785344 24689919 178bcff /tmp/vmlinux.orig 12990946 1913568 9785344 24689858 178bcc2 /tmp/vmlinux.patched Note, this version does not save as much as the version of this patch I had a few years ago. That is because in the mean time, commit f71130de5c7f ("tracing: Add a helper function for event print functions") did a lot of the work my original patch did. But this change helps slightly, and is part of a larger clean up to reduce the size much further. Link: http://lkml.kernel.org/r/20120810034707.378538034@goodmis.org Cc: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 5 +++++ include/trace/ftrace.h | 10 +--------- kernel/trace/trace_output.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4e4cc28623ad..a91ab93d7250 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -163,6 +163,8 @@ void trace_current_buffer_discard_commit(struct ring_buffer *buffer, void tracing_record_cmdline(struct task_struct *tsk); +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...); + struct event_filter; enum trace_reg { @@ -197,6 +199,9 @@ struct ftrace_event_class { extern int ftrace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); +int ftrace_output_event(struct trace_iterator *iter, struct ftrace_event_call *event, + char *fmt, ...); + enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1a8b28db3775..3873d6e4697f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -265,11 +265,9 @@ static notrace enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ struct trace_event *event) \ { \ - struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##template *field; \ struct trace_entry *entry; \ struct trace_seq *p = &iter->tmp_seq; \ - int ret; \ \ entry = iter->ent; \ \ @@ -281,13 +279,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ field = (typeof(field))entry; \ \ trace_seq_init(p); \ - ret = trace_seq_printf(s, "%s: ", #call); \ - if (ret) \ - ret = trace_seq_printf(s, print); \ - if (!ret) \ - return TRACE_TYPE_PARTIAL_LINE; \ - \ - return TRACE_TYPE_HANDLED; \ + return ftrace_output_call(iter, #call, print); \ } \ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ .trace = ftrace_raw_output_##call, \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed32284fbe32..ca0e79e2abaa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -439,6 +439,37 @@ int ftrace_raw_output_prep(struct trace_iterator *iter, } EXPORT_SYMBOL(ftrace_raw_output_prep); +static int ftrace_output_raw(struct trace_iterator 
*iter, char *name, + char *fmt, va_list ap) +{ + struct trace_seq *s = &iter->seq; + int ret; + + ret = trace_seq_printf(s, "%s: ", name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_vprintf(s, fmt, ap); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = ftrace_output_raw(iter, name, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_output_call); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.3-14-g43fede From 35bb4399bd0ef16b8a57fccea0047d98b6b0e7fb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Aug 2012 22:26:46 -0400 Subject: tracing: Move event storage for array from macro to standalone function The code that shows array fields for events is defined for all events. This can add up quite a bit when you have over 500 events. By making helper functions in the core kernel to do the work instead, we can shrink the size of the kernel down a bit. With a kernel configured with 502 events, the change in size was: text data bss dec hex filename 12990946 1913568 9785344 24689858 178bcc2 /tmp/vmlinux 12987390 1913504 9785344 24686238 178ae9e /tmp/vmlinux.patched That's a total of 3556 bytes, which comes down to 7 bytes per event. Although it's not much, this code is just called at initialization of the events. Link: http://lkml.kernel.org/r/20120810034708.084036335@goodmis.org Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 8 ++++---- include/trace/ftrace.h | 12 ++++-------- kernel/trace/trace_events.c | 6 ------ kernel/trace/trace_export.c | 12 ++++-------- kernel/trace/trace_output.c | 21 +++++++++++++++++++++ 5 files changed, 33 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index a91ab93d7250..ffe642eb84fa 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -202,6 +202,10 @@ extern int ftrace_event_reg(struct ftrace_event_call *event, int ftrace_output_event(struct trace_iterator *iter, struct ftrace_event_call *event, char *fmt, ...); +int ftrace_event_define_field(struct ftrace_event_call *call, + char *type, int len, char *item, int offset, + int field_size, int sign, int filter); + enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, @@ -500,10 +504,6 @@ enum { FILTER_TRACE_FN, }; -#define EVENT_STORAGE_SIZE 128 -extern struct mutex event_storage_mutex; -extern char event_storage[EVENT_STORAGE_SIZE]; - extern int trace_event_raw_init(struct ftrace_event_call *call); extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3873d6e4697f..54928faf2119 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -302,15 +302,11 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __array #define __array(type, item, len) \ do { \ - mutex_lock(&event_storage_mutex); \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ - mutex_unlock(&event_storage_mutex); \ + ret = ftrace_event_define_field(event_call, 
#type, len, \ + #item, offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e71ffd4eccb5..22826c73a9da 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b6..39c746c5ae73 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -96,14 +96,10 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __array(type, item, len) \ do { \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ + ret = ftrace_event_define_field(event_call, #type, len, \ + #item, offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ca0e79e2abaa..ee8d74840b88 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,6 +20,10 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +#define EVENT_STORAGE_SIZE 128 +static DEFINE_MUTEX(event_storage_mutex); +static char event_storage[EVENT_STORAGE_SIZE]; + int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; @@ -470,6 +474,23 @@ int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) } EXPORT_SYMBOL_GPL(ftrace_output_call); +int ftrace_event_define_field(struct ftrace_event_call *call, + char *type, int len, char *item, int offset, + int field_size, int sign, int filter) +{ + int ret; + + mutex_lock(&event_storage_mutex); + snprintf(event_storage, sizeof(event_storage), + "%s[%d]", type, len); + ret = trace_define_field(call, event_storage, item, offset, + field_size, sign, filter); + mutex_unlock(&event_storage_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_event_define_field); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.3-14-g43fede From 3fd40d1ee6a317523172ab95b6f7ea41ba8fcee3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Aug 2012 22:42:57 -0400 Subject: tracing: Use helper functions in event assignment to shrink macro size The functions that assign the contents for the ftrace events are defined by the TRACE_EVENT() macros. Each event has its own unique way to assign data to its buffer. When you have over 500 events, that means there's 500 functions assigning data uniquely for each event (not really that many, as DECLARE_EVENT_CLASS() and multiple DEFINE_EVENT()s will only need a single function). By making helper functions in the core kernel to do some of the work instead, we can shrink the size of the kernel down a bit. 
With a kernel configured with 502 events, the change in size was: text data bss dec hex filename 12987390 1913504 9785344 24686238 178ae9e /tmp/vmlinux 12959102 1913504 9785344 24657950 178401e /tmp/vmlinux.patched That's a total of 28288 bytes, which comes down to 56 bytes per event. Link: http://lkml.kernel.org/r/20120810034708.370808175@goodmis.org Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 15 +++++++++++++++ include/trace/ftrace.h | 22 ++++++---------------- kernel/trace/trace_events.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index ffe642eb84fa..9d3fe0658398 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -206,6 +206,21 @@ int ftrace_event_define_field(struct ftrace_event_call *call, char *type, int len, char *item, int offset, int field_size, int sign, int filter); +struct ftrace_event_buffer { + struct ring_buffer *buffer; + struct ring_buffer_event *event; + struct ftrace_event_file *ftrace_file; + void *entry; + unsigned long flags; + int pc; +}; + +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); + enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 54928faf2119..1cc2265caa52 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -532,37 +532,27 @@ static notrace void \ ftrace_raw_event_##call(void *__data, proto) \ { \ struct ftrace_event_file *ftrace_file = __data; \ - struct ftrace_event_call *event_call = ftrace_file->event_call; \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ - struct ring_buffer_event *event; \ + struct ftrace_event_buffer fbuffer; \ struct ftrace_raw_##call *entry; \ - struct ring_buffer *buffer; \ - unsigned long irq_flags; \ int __data_size; \ - int pc; \ \ if (ftrace_trigger_soft_disabled(ftrace_file)) \ return; \ \ - local_save_flags(irq_flags); \ - pc = preempt_count(); \ - \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ \ - event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, \ - event_call->event.type, \ - sizeof(*entry) + __data_size, \ - irq_flags, pc); \ - if (!event) \ + entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file, \ + sizeof(*entry) + __data_size); \ + \ + if (!entry) \ return; \ - entry = ring_buffer_event_data(event); \ \ tstruct \ \ { assign; } \ \ - event_trigger_unlock_commit(ftrace_file, buffer, event, entry, \ - irq_flags, pc); \ + ftrace_event_buffer_commit(&fbuffer); \ } /* * The ftrace_test_probe is compiled out, it is only here as a build time check diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 22826c73a9da..b8f73b333a3c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -188,6 +188,36 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); +void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, + struct ftrace_event_file *ftrace_file, + unsigned long len) +{ + struct ftrace_event_call *event_call = ftrace_file->event_call; + + local_save_flags(fbuffer->flags); + fbuffer->pc = preempt_count(); + fbuffer->ftrace_file = ftrace_file; + + fbuffer->event = + trace_event_buffer_lock_reserve(&fbuffer->buffer, ftrace_file, + 
event_call->event.type, len, + fbuffer->flags, fbuffer->pc); + if (!fbuffer->event) + return NULL; + + fbuffer->entry = ring_buffer_event_data(fbuffer->event); + return fbuffer->entry; +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_reserve); + +void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer) +{ + event_trigger_unlock_commit(fbuffer->ftrace_file, fbuffer->buffer, + fbuffer->event, fbuffer->entry, + fbuffer->flags, fbuffer->pc); +} +EXPORT_SYMBOL_GPL(ftrace_event_buffer_commit); + int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type, void *data) { -- cgit v1.3-14-g43fede From b196e2b9e262be01737dc2bbf9e3c7c87340fa4d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Feb 2014 15:45:07 -0500 Subject: tracing: Warn if a tracepoint is not set via debugfs Tracepoints were made to allow enabling a tracepoint in a module before that module was loaded. When a tracepoint is enabled and it does not exist, the name is stored and will be enabled when the tracepoint is created. The problem with this approach is that when a tracepoint that is expected to exist is enabled, no warning is given if it does not actually exist. To add salt to the wound, if a module is added and sets the FORCED flag, which can happen if it isn't signed properly, the tracepoint code will not enable the tracepoints, but they will be created in the debugfs system! When a user goes to enable the tracepoint, the tracepoint code will not see it existing and will think it is to be enabled later AND WILL NOT GIVE A WARNING. The tracing will look like it succeeded but will actually be doing nothing. This will cause lots of confusion and headaches for developers trying to figure out why they are not seeing their tracepoints. Link: http://lkml.kernel.org/r/20140213154507.4040fb06@gandalf.local.home Reported-by: Mathieu Desnoyers Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0d4ef26574ff..0058f33d05c1 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -62,6 +62,7 @@ struct tracepoint_entry { struct hlist_node hlist; struct tracepoint_func *funcs; int refcount; /* Number of times armed. 0 if disarmed.
*/ + int enabled; /* Tracepoint enabled */ char name[0]; }; @@ -237,6 +238,7 @@ static struct tracepoint_entry *add_tracepoint(const char *name) memcpy(&e->name[0], name, name_len); e->funcs = NULL; e->refcount = 0; + e->enabled = 0; hlist_add_head(&e->hlist, head); return e; } @@ -316,6 +318,7 @@ static void tracepoint_update_probe_range(struct tracepoint * const *begin, if (mark_entry) { set_tracepoint(&mark_entry, *iter, !!mark_entry->refcount); + mark_entry->enabled = !!mark_entry->refcount; } else { disable_tracepoint(*iter); } @@ -380,6 +383,8 @@ tracepoint_add_probe(const char *name, void *probe, void *data) int tracepoint_probe_register(const char *name, void *probe, void *data) { struct tracepoint_func *old; + struct tracepoint_entry *entry; + int ret = 0; mutex_lock(&tracepoints_mutex); old = tracepoint_add_probe(name, probe, data); @@ -388,9 +393,13 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) return PTR_ERR(old); } tracepoint_update_probes(); /* may update entry */ + entry = get_tracepoint(name); + /* Make sure the entry was enabled */ + if (!entry || !entry->enabled) + ret = -ENODEV; mutex_unlock(&tracepoints_mutex); release_probes(old); - return 0; + return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); -- cgit v1.3-14-g43fede From 1dc43cf0be9a94a6a7273db284152db15c526106 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 24 Feb 2014 19:59:56 +0100 Subject: ftrace: Cleanup of global variables ftrace_new_pgs and ftrace_update_cnt Some of them can be local to functions, so make them local and pass them as parameters where needed: * __start_mcount_loc+__stop_mcount_loc are local to ftrace_init * ftrace_new_pgs -> new_pgs/start_pg * ftrace_update_cnt -> local update_cnt in ftrace_update_code Link: http://lkml.kernel.org/r/1393268401-24379-1-git-send-email-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5313c1100d30..3f95bbeb8e8d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1174,8 +1174,6 @@ struct ftrace_page { int size; }; -static struct ftrace_page *ftrace_new_pgs; - #define ENTRY_SIZE sizeof(struct dyn_ftrace) #define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE) @@ -2246,7 +2244,6 @@ static void ftrace_shutdown_sysctl(void) } static cycle_t ftrace_update_time; -static unsigned long ftrace_update_cnt; unsigned long ftrace_update_tot_cnt; static inline int ops_traces_mod(struct ftrace_ops *ops) @@ -2302,11 +2299,12 @@ static int referenced_filters(struct dyn_ftrace *rec) return cnt; } -static int ftrace_update_code(struct module *mod) +static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; + unsigned long update_cnt = 0; unsigned long ref = 0; bool test = false; int i; @@ -2332,9 +2330,8 @@ static int ftrace_update_code(struct module *mod) } start = ftrace_now(raw_smp_processor_id()); - ftrace_update_cnt = 0; - for (pg = ftrace_new_pgs; pg; pg = pg->next) { + for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { int cnt = ref; @@ -2355,7 +2352,7 @@ static int ftrace_update_code(struct module *mod) if (!ftrace_code_disable(mod, p)) break; - ftrace_update_cnt++; + update_cnt++; /* * If the tracing is enabled, go ahead and enable the record. 
@@ -2374,11 +2371,9 @@ static int ftrace_update_code(struct module *mod) } } - ftrace_new_pgs = NULL; - stop = ftrace_now(raw_smp_processor_id()); ftrace_update_time = stop - start; - ftrace_update_tot_cnt += ftrace_update_cnt; + ftrace_update_tot_cnt += update_cnt; return 0; } @@ -4270,9 +4265,6 @@ static int ftrace_process_locs(struct module *mod, /* Assign the last page to ftrace_pages */ ftrace_pages = pg; - /* These new locations need to be initialized */ - ftrace_new_pgs = start_pg; - /* * We only need to disable interrupts on start up * because we are modifying code that an interrupt @@ -4283,7 +4275,7 @@ static int ftrace_process_locs(struct module *mod, */ if (!mod) local_irq_save(flags); - ftrace_update_code(mod); + ftrace_update_code(mod, start_pg); if (!mod) local_irq_restore(flags); ret = 0; @@ -4392,11 +4384,10 @@ struct notifier_block ftrace_module_exit_nb = { .priority = INT_MIN, /* Run after anything that can remove kprobes */ }; -extern unsigned long __start_mcount_loc[]; -extern unsigned long __stop_mcount_loc[]; - void __init ftrace_init(void) { + extern unsigned long __start_mcount_loc[]; + extern unsigned long __stop_mcount_loc[]; unsigned long count, addr, flags; int ret; -- cgit v1.3-14-g43fede From c867ccd8388d1c1a31bef9c54544b2ef32f0ebca Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 24 Feb 2014 19:59:57 +0100 Subject: ftrace: Inline the code from ftrace_dyn_table_alloc() The function used to do allocations some time ago. This no longer happens and it only checks the count and prints some info. This patch inlines the body to the only caller. There are two reasons: * the name of the function was misleading * it's clear what is going on in ftrace_init now Link: http://lkml.kernel.org/r/1393268401-24379-2-git-send-email-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f95bbeb8e8d..76b6ed29d856 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2465,22 +2465,6 @@ ftrace_allocate_pages(unsigned long num_to_init) return NULL; } -static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) -{ - int cnt; - - if (!num_to_init) { - pr_info("ftrace: No functions to be traced?\n"); - return -1; - } - - cnt = num_to_init / ENTRIES_PER_PAGE; - pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt + 1); - - return 0; -} - #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { @@ -4403,10 +4387,13 @@ void __init ftrace_init(void) goto failed; count = __stop_mcount_loc - __start_mcount_loc; - - ret = ftrace_dyn_table_alloc(count); - if (ret) + if (!count) { + pr_info("ftrace: No functions to be traced?\n"); goto failed; + } + + pr_info("ftrace: allocating %ld entries in %ld pages\n", + count, count / ENTRIES_PER_PAGE + 1); last_ftrace_enabled = ftrace_enabled = 1; -- cgit v1.3-14-g43fede From af64a7cb09db77344c596a0bf3d57d77257e8bf5 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 24 Feb 2014 19:59:58 +0100 Subject: ftrace: Pass retval through return in ftrace_dyn_arch_init() No architecture uses the "data" parameter in ftrace_dyn_arch_init() in any way, it just sets the value to 0. And this is used as a return value in the caller -- ftrace_init, which just checks the retval against zero. 
Note there is also "return 0" in every ftrace_dyn_arch_init. So it is enough to check the retval and remove all the indirect sets of data on all archs. Link: http://lkml.kernel.org/r/1393268401-24379-3-git-send-email-jslaby@suse.cz Cc: linux-arch@vger.kernel.org Signed-off-by: Jiri Slaby Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 3 --- arch/arm/kernel/ftrace.c | 2 -- arch/blackfin/kernel/ftrace.c | 3 --- arch/ia64/kernel/ftrace.c | 2 -- arch/metag/kernel/ftrace.c | 3 --- arch/microblaze/kernel/ftrace.c | 3 --- arch/mips/kernel/ftrace.c | 3 --- arch/powerpc/kernel/ftrace.c | 5 ----- arch/s390/kernel/ftrace.c | 1 - arch/sh/kernel/ftrace.c | 3 --- arch/sparc/kernel/ftrace.c | 4 ---- arch/tile/kernel/ftrace.c | 2 -- arch/x86/kernel/ftrace.c | 3 --- kernel/trace/ftrace.c | 6 ++---- 14 files changed, 2 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 79fcafc7fd64..117168884023 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -360,9 +360,6 @@ function below should be sufficient for most people: int __init ftrace_dyn_arch_init(void *data) { - /* return value is done indirectly via data */ - *(unsigned long *)data = 0; - return 0; } diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 34e56647dcee..5cd0d05edf35 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -158,8 +158,6 @@ int ftrace_make_nop(struct module *mod, int __init ftrace_dyn_arch_init(void *data) { - *(unsigned long *)data = 0; - return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/blackfin/kernel/ftrace.c b/arch/blackfin/kernel/ftrace.c index 9277905b82cf..f74c5ae6a25b 100644 --- a/arch/blackfin/kernel/ftrace.c +++ b/arch/blackfin/kernel/ftrace.c @@ -67,9 +67,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - /* return value is done indirectly via data */ - *(unsigned long *)data = 0; - return 0; } diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c index 7fc8c961b1f7..cfaa93a8bbdf 100644 --- a/arch/ia64/kernel/ftrace.c +++ b/arch/ia64/kernel/ftrace.c @@ -200,7 +200,5 @@ int ftrace_update_ftrace_func(ftrace_func_t func) /* run from kstop_machine */ int __init ftrace_dyn_arch_init(void *data) { - *(unsigned long *)data = 0; - return 0; } diff --git a/arch/metag/kernel/ftrace.c b/arch/metag/kernel/ftrace.c index a774f321643f..bf593932b353 100644 --- a/arch/metag/kernel/ftrace.c +++ b/arch/metag/kernel/ftrace.c @@ -119,8 +119,5 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) /* run from kstop_machine */ int __init ftrace_dyn_arch_init(void *data) { - /* The return code is returned via data */ - writel(0, data); - return 0; } diff --git a/arch/microblaze/kernel/ftrace.c b/arch/microblaze/kernel/ftrace.c index e8a5e9cf4ed1..ffa595c7fec2 100644 --- a/arch/microblaze/kernel/ftrace.c +++ b/arch/microblaze/kernel/ftrace.c @@ -173,9 +173,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) int __init ftrace_dyn_arch_init(void *data) { - /* The return code is retured via data */ - *(unsigned long *)data = 0; - return 0; } diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c index 185ba258361b..013016bec9e1 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -206,9 +206,6 @@ int __init ftrace_dyn_arch_init(void *data) /* Remove "b ftrace_stub" to ensure ftrace_caller() is executed 
*/ ftrace_modify_code(MCOUNT_ADDR, INSN_NOP); - /* The return code is retured via data */ - *(unsigned long *)data = 0; - return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 9b27b293a922..d059664cdf16 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -533,11 +533,6 @@ void arch_ftrace_update_code(int command) int __init ftrace_dyn_arch_init(void *data) { - /* caller expects data to be zero */ - unsigned long *p = data; - - *p = 0; - return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 224db03e9518..77b2f3a1f50a 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -132,7 +132,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - *(unsigned long *) data = 0; return 0; } diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 30e13196d35b..493997541d2c 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -274,9 +274,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) int __init ftrace_dyn_arch_init(void *data) { - /* The return code is retured via data */ - __raw_writel(0, (unsigned long)data); - return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 03ab022e51c5..ee813b82da49 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -84,10 +84,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) int __init ftrace_dyn_arch_init(void *data) { - unsigned long *p = data; - - *p = 0; - return 0; } #endif diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c index f1c452092eeb..34d9ea0bca9f 100644 --- a/arch/tile/kernel/ftrace.c +++ b/arch/tile/kernel/ftrace.c @@ -169,8 +169,6 @@ int ftrace_make_nop(struct module *mod, int __init ftrace_dyn_arch_init(void *data) { - *(unsigned long *)data = 0; - return 0; } #endif /* CONFIG_DYNAMIC_FTRACE */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8cabf638cb63..bbe5a5b88aad 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -670,9 +670,6 @@ void arch_ftrace_update_code(int command) int __init ftrace_dyn_arch_init(void *data) { - /* The return code is retured via data */ - *(unsigned long *)data = 0; - return 0; } #endif diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 76b6ed29d856..083c6d5fce25 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4379,11 +4379,9 @@ void __init ftrace_init(void) addr = (unsigned long)ftrace_stub; local_irq_save(flags); - ftrace_dyn_arch_init(&addr); + ret = ftrace_dyn_arch_init(&addr); local_irq_restore(flags); - - /* ftrace_dyn_arch_init places the return code in addr */ - if (addr) + if (ret) goto failed; count = __stop_mcount_loc - __start_mcount_loc; -- cgit v1.3-14-g43fede From 3a36cb11ca65cd6804972eaf1000378ba4384ea7 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 24 Feb 2014 19:59:59 +0100 Subject: ftrace: Do not pass data to ftrace_dyn_arch_init As the data parameter is not really used by any ftrace_dyn_arch_init, remove that from ftrace_dyn_arch_init. This also removes the addr local variable from ftrace_init which is now unused. Note the documentation was imprecise as it did not suggest to set (*data) to 0. 
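In sketch form, the change in the caller contract across this and the previous patch (condensed and illustrative; see the real ftrace_init() hunks in these two diffs for the actual code):

/*
 * Before: ftrace_dyn_arch_init() handed its status back through the
 * pointer argument, and a nonzero value left in *data meant failure:
 *
 *	unsigned long addr = (unsigned long)ftrace_stub;
 *	ftrace_dyn_arch_init(&addr);
 *	if (addr)
 *		goto failed;
 */
static int __init ftrace_init_sketch(void)
{
	/* After: the status is simply the return value. */
	if (ftrace_dyn_arch_init())
		return -1;	/* give up on dynamic ftrace */

	return 0;
}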
Link: http://lkml.kernel.org/r/1393268401-24379-4-git-send-email-jslaby@suse.cz Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: linux-arch@vger.kernel.org Signed-off-by: Jiri Slaby Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace-design.txt | 2 +- arch/arm/kernel/ftrace.c | 2 +- arch/blackfin/kernel/ftrace.c | 2 +- arch/ia64/kernel/ftrace.c | 2 +- arch/metag/kernel/ftrace.c | 2 +- arch/microblaze/kernel/ftrace.c | 2 +- arch/mips/kernel/ftrace.c | 2 +- arch/powerpc/kernel/ftrace.c | 2 +- arch/s390/kernel/ftrace.c | 2 +- arch/sh/kernel/ftrace.c | 2 +- arch/sparc/kernel/ftrace.c | 2 +- arch/tile/kernel/ftrace.c | 2 +- arch/x86/kernel/ftrace.c | 2 +- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 7 ++----- 15 files changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 117168884023..3f669b9e8852 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -358,7 +358,7 @@ Every arch has an init callback function. If you need to do something early on to initialize some state, this is the time to do that. Otherwise, this simple function below should be sufficient for most people: -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 5cd0d05edf35..c108ddcb9ba4 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -156,7 +156,7 @@ int ftrace_make_nop(struct module *mod, return ret; } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/blackfin/kernel/ftrace.c b/arch/blackfin/kernel/ftrace.c index f74c5ae6a25b..095de0fa044d 100644 --- a/arch/blackfin/kernel/ftrace.c +++ b/arch/blackfin/kernel/ftrace.c @@ -65,7 +65,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ftrace_modify_code(ip, call, sizeof(call)); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c index cfaa93a8bbdf..3b0c2aa07857 100644 --- a/arch/ia64/kernel/ftrace.c +++ b/arch/ia64/kernel/ftrace.c @@ -198,7 +198,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) } /* run from kstop_machine */ -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/metag/kernel/ftrace.c b/arch/metag/kernel/ftrace.c index bf593932b353..ed1d685157c2 100644 --- a/arch/metag/kernel/ftrace.c +++ b/arch/metag/kernel/ftrace.c @@ -117,7 +117,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) } /* run from kstop_machine */ -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/microblaze/kernel/ftrace.c b/arch/microblaze/kernel/ftrace.c index ffa595c7fec2..bbcd2533766c 100644 --- a/arch/microblaze/kernel/ftrace.c +++ b/arch/microblaze/kernel/ftrace.c @@ -171,7 +171,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ret; } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c index 013016bec9e1..1ba7afe6ab74 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -198,7 +198,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ftrace_modify_code(FTRACE_CALL_IP, new); } -int __init 
ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { /* Encode the instructions when booting */ ftrace_dyn_arch_init_insns(); diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index d059664cdf16..71ce4cbb7e9f 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -531,7 +531,7 @@ void arch_ftrace_update_code(int command) ftrace_disable_ftrace_graph_caller(); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 77b2f3a1f50a..54d6493c4a56 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -130,7 +130,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return 0; } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 493997541d2c..3c74f53db6db 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -272,7 +272,7 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_modify_code(rec->ip, old, new); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index ee813b82da49..0a2d2ddff543 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -82,7 +82,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ftrace_modify_code(ip, old, new); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c index 34d9ea0bca9f..8d52d83cc516 100644 --- a/arch/tile/kernel/ftrace.c +++ b/arch/tile/kernel/ftrace.c @@ -167,7 +167,7 @@ int ftrace_make_nop(struct module *mod, return ret; } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bbe5a5b88aad..4b66adf17369 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -668,7 +668,7 @@ void arch_ftrace_update_code(int command) atomic_dec(&modifying_ftrace_code); } -int __init ftrace_dyn_arch_init(void *data) +int __init ftrace_dyn_arch_init(void) { return 0; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e6141be2fad5..1bbb2cd631de 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -423,7 +423,7 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable); /* defined in arch */ extern int ftrace_ip_converted(unsigned long ip); -extern int ftrace_dyn_arch_init(void *data); +extern int ftrace_dyn_arch_init(void); extern void ftrace_replace_code(int enable); extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 083c6d5fce25..5bd70e8b09b0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4372,14 +4372,11 @@ void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; - unsigned long count, addr, flags; + unsigned long count, flags; int ret; - /* Keep the ftrace pointer to the stub */ - addr = (unsigned long)ftrace_stub; - local_irq_save(flags); - ret = ftrace_dyn_arch_init(&addr); + ret = ftrace_dyn_arch_init(); local_irq_restore(flags); if (ret) goto failed; -- cgit v1.3-14-g43fede From 
cd21067f69240041d36e491ff5597e0217615465 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 24 Feb 2014 17:12:21 +0100 Subject: ftrace: Warn on error when modifying ftrace function We should print some warning and kill ftrace functionality when the ftrace function is not set correctly. Otherwise, ftrace might do crazy things without an explanation. The error value has been ignored so far. Note that an error that happens during updating all the traced calls is handled in ftrace_replace_code(). We print more details about the particular failing address via ftrace_bug() there. Link: http://lkml.kernel.org/r/1393258342-29978-3-git-send-email-pmladek@suse.cz Signed-off-by: Petr Mladek Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5bd70e8b09b0..0e48ff4cefa5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1994,6 +1994,7 @@ int __weak ftrace_arch_code_modify_post_process(void) void ftrace_modify_all_code(int command) { int update = command & FTRACE_UPDATE_TRACE_FUNC; + int err = 0; /* * If the ftrace_caller calls a ftrace_ops func directly, @@ -2005,8 +2006,11 @@ void ftrace_modify_all_code(int command) * to make sure the ops are having the right functions * traced. */ - if (update) - ftrace_update_ftrace_func(ftrace_ops_list_func); + if (update) { + err = ftrace_update_ftrace_func(ftrace_ops_list_func); + if (FTRACE_WARN_ON(err)) + return; + } if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); @@ -2019,13 +2023,16 @@ void ftrace_modify_all_code(int command) /* If irqs are disabled, we are in stop machine */ if (!irqs_disabled()) smp_call_function(ftrace_sync_ipi, NULL, 1); - ftrace_update_ftrace_func(ftrace_trace_function); + err = ftrace_update_ftrace_func(ftrace_trace_function); + if (FTRACE_WARN_ON(err)) + return; } if (command & FTRACE_START_FUNC_RET) - ftrace_enable_ftrace_graph_caller(); + err = ftrace_enable_ftrace_graph_caller(); else if (command & FTRACE_STOP_FUNC_RET) - ftrace_disable_ftrace_graph_caller(); + err = ftrace_disable_ftrace_graph_caller(); + FTRACE_WARN_ON(err); } static int __ftrace_modify_code(void *data) -- cgit v1.3-14-g43fede From f952d10ff40b436a8ef156a74ec327abe303823d Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 27 Jan 2014 17:38:42 -0500 Subject: audit: Use more current logging style again Add pr_fmt to prefix "audit: " to output. Convert printk(KERN_<level> to pr_<level>. Coalesce formats. Signed-off-by: Richard Guy Briggs --- kernel/auditfilter.c | 12 +++++++----- kernel/auditsc.c | 31 +++++++++++++++---------------- 2 files changed, 22 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 14a78cca384e..3152d1aea164 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -19,6 +19,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -247,7 +249,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) ; } if (unlikely(rule->action == AUDIT_POSSIBLE)) { - printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + pr_err("AUDIT_POSSIBLE is deprecated\n"); goto exit_err; } if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) @@ -477,8 +479,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, /* Keep currently invalid fields around in
case they * become valid after a policy reload. */ if (err == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM " - "\'%s\' is invalid\n", str); + pr_warn("audit rule for LSM \'%s\' is invalid\n", + str); err = 0; } if (err) { @@ -707,8 +709,8 @@ static inline int audit_dupe_lsm_field(struct audit_field *df, /* Keep currently invalid fields around in case they * become valid after a policy reload. */ if (ret == -EINVAL) { - printk(KERN_WARNING "audit rule for LSM \'%s\' is " - "invalid\n", df->lsm_str); + pr_warn("audit rule for LSM \'%s\' is invalid\n", + df->lsm_str); ret = 0; } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 10176cd5956a..6874c1fd453d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -42,6 +42,8 @@ * and for LSPP certification compliance. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -850,16 +852,15 @@ static inline void audit_free_names(struct audit_context *context) if (context->put_count + context->ino_count != context->name_count) { int i = 0; - printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d" - " name_count=%d put_count=%d" - " ino_count=%d [NOT freeing]\n", - __FILE__, __LINE__, + pr_err("%s:%d(:%d): major=%d in_syscall=%d" + " name_count=%d put_count=%d ino_count=%d" + " [NOT freeing]\n", __FILE__, __LINE__, context->serial, context->major, context->in_syscall, context->name_count, context->put_count, context->ino_count); list_for_each_entry(n, &context->names_list, list) { - printk(KERN_ERR "names[%d] = %p = %s\n", i++, - n->name, n->name->name ?: "(null)"); + pr_err("names[%d] = %p = %s\n", i++, n->name, + n->name->name ?: "(null)"); } dump_stack(); return; @@ -1550,7 +1551,7 @@ static inline void handle_one(const struct inode *inode) if (likely(put_tree_ref(context, chunk))) return; if (unlikely(!grow_tree_refs(context))) { - printk(KERN_WARNING "out of memory, audit has lost a tree reference\n"); + pr_warn("out of memory, audit has lost a tree reference\n"); audit_set_auditable(context); audit_put_chunk(chunk); unroll_tree_refs(context, p, count); @@ -1609,8 +1610,7 @@ retry: goto retry; } /* too bad */ - printk(KERN_WARNING - "out of memory, audit has lost a tree reference\n"); + pr_warn("out of memory, audit has lost a tree reference\n"); unroll_tree_refs(context, p, count); audit_set_auditable(context); return; @@ -1682,7 +1682,7 @@ void __audit_getname(struct filename *name) if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n", + pr_err("%s:%d(:%d): ignoring getname(%p)\n", __FILE__, __LINE__, context->serial, name); dump_stack(); #endif @@ -1721,15 +1721,15 @@ void audit_putname(struct filename *name) BUG_ON(!context); if (!context->in_syscall) { #if AUDIT_DEBUG == 2 - printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n", + pr_err("%s:%d(:%d): final_putname(%p)\n", __FILE__, __LINE__, context->serial, name); if (context->name_count) { struct audit_names *n; int i = 0; list_for_each_entry(n, &context->names_list, list) - printk(KERN_ERR "name[%d] = %p = %s\n", i++, - n->name, n->name->name ?: "(null)"); + pr_err("name[%d] = %p = %s\n", i++, n->name, + n->name->name ?: "(null)"); } #endif final_putname(name); @@ -1738,9 +1738,8 @@ void audit_putname(struct filename *name) else { ++context->put_count; if (context->put_count > context->name_count) { - printk(KERN_ERR "%s:%d(:%d): major=%d" - " in_syscall=%d putname(%p) name_count=%d" - " put_count=%d\n", + pr_err("%s:%d(:%d): major=%d in_syscall=%d putname(%p)" + " name_count=%d put_count=%d\n", __FILE__, 
__LINE__, context->serial, context->major, context->in_syscall, name->name, -- cgit v1.3-14-g43fede From d211f177b28ec070c25b3d0b960aa55f352f731f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 8 Mar 2014 15:31:54 -0800 Subject: audit: Update kdoc for audit_send_reply and audit_list_rules_send The kbuild test robot reported: > tree: git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace.git for-next > head: 6f285b19d09f72e801525f5eea1bdad22e559bf0 > commit: 6f285b19d09f72e801525f5eea1bdad22e559bf0 [2/2] audit: Send replies in the proper network namespace. > reproduce: make htmldocs > > >> Warning(kernel/audit.c:575): No description found for parameter 'request_skb' > >> Warning(kernel/audit.c:575): Excess function parameter 'portid' description in 'audit_send_reply' > >> Warning(kernel/auditfilter.c:1074): No description found for parameter 'request_skb' > >> Warning(kernel/auditfilter.c:1074): Excess function parameter 'portid' description in 'audit_list_rules_s Which was caused by my failure to update the kdoc annotations when I updated the functions. Fix that small oversight now. Signed-off-by: "Eric W. Biederman" --- kernel/audit.c | 2 +- kernel/auditfilter.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 32086bff5564..3392d3e0254a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -559,7 +559,7 @@ static int audit_send_reply_thread(void *arg) } /** * audit_send_reply - send an audit reply message via netlink - * @portid: netlink port to which to send reply + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: sequence number * @type: audit message type * @done: done (last) flag diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index e8d1c7c515d7..92062fd6cc8c 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1067,7 +1067,7 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, /** * audit_list_rules_send - list the audit rules - * @portid: target portid for netlink audit messages + * @request_skb: skb of request we are replying to (used to target the reply) * @seq: netlink audit message sequence (serial) number */ int audit_list_rules_send(struct sk_buff *request_skb, int seq) -- cgit v1.3-14-g43fede From e97ca8e5b864f88b028c1759ba8536fa827d6d96 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 10 Mar 2014 15:49:43 -0700 Subject: mm: fix GFP_THISNODE callers and clarify GFP_THISNODE is for callers that implement their own clever fallback to remote nodes. It restricts the allocation to the specified node and does not invoke reclaim, assuming that the caller will take care of it when the fallback fails, e.g. through a subsequent allocation request without GFP_THISNODE set. However, many current GFP_THISNODE users only want the node-exclusive aspect of the flag, without actually implementing their own fallback or triggering reclaim if necessary. This results in things like page migration failing prematurely even when there is easily reclaimable memory available, unless kswapd happens to be running already or a concurrent allocation attempt triggers the necessary reclaim. Convert all callsites that don't implement their own fallback strategy to __GFP_THISNODE. This restricts the allocation to a single node too, but at the same time allows the allocator to enter the slowpath, wake kswapd, and invoke direct reclaim if necessary, to make the allocation happen when memory is full.
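To make the conversion concrete, here is a minimal sketch of a converted caller (an editorial illustration, not part of the patch; the flag composition is the one quoted from the gfp.h hunk below):

/*
 * GFP_THISNODE == __GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY on
 * NUMA: node-exclusive *and* no reclaim, no retries.  Plain
 * __GFP_THISNODE only pins the node; kswapd wakeups and direct
 * reclaim still happen in the slowpath.
 */
static struct page *node_local_page(int nid)
{
	/* May reclaim, but never falls back to another node. */
	return alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_THISNODE, 0);
}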
Signed-off-by: Johannes Weiner Acked-by: Rik van Riel Cc: Jan Stancek Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/uncached.c | 2 +- arch/powerpc/platforms/cell/ras.c | 3 ++- drivers/misc/sgi-xp/xpc_uv.c | 2 +- include/linux/gfp.h | 4 ++++ include/linux/mmzone.h | 4 ++-- include/linux/slab.h | 2 +- kernel/profile.c | 4 ++-- mm/migrate.c | 11 ++++++----- 8 files changed, 19 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index a96bcf83a735..20e8a9b21d75 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ page = alloc_pages_exact_node(nid, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 5ec1e47a0d77..e865d748179b 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -123,7 +123,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->pages = alloc_pages_exact_node(area->nid, + GFP_KERNEL|__GFP_THISNODE, area->order); if (!area->pages) { diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index b9e2000969f0..95c894482fdd 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -240,7 +240,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, nid = cpu_to_node(cpu); page = alloc_pages_exact_node(nid, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0437439bc047..39b81dc7d01a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -123,6 +123,10 @@ struct vm_area_struct; __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ __GFP_NO_KSWAPD) +/* + * GFP_THISNODE does not perform any reclaim, you most likely want to + * use __GFP_THISNODE to allocate from a given node without fallback! + */ #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5f2052c83154..9b61b9bf81ac 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -590,10 +590,10 @@ static inline bool zone_is_empty(struct zone *zone) /* * The NUMA zonelists are doubled because we need zonelists that restrict the - * allocations to a single node for GFP_THISNODE. + * allocations to a single node for __GFP_THISNODE. * * [0] : Zonelist with fallback - * [1] : No fallback (GFP_THISNODE) + * [1] : No fallback (__GFP_THISNODE) */ #define MAX_ZONELISTS 2 diff --git a/include/linux/slab.h b/include/linux/slab.h index 9260abdd67df..b5b2df60299e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -410,7 +410,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * * %GFP_NOWAIT - Allocation will not sleep. * - * %GFP_THISNODE - Allocate node-local memory only. + * %__GFP_THISNODE - Allocate node-local memory only. * * %GFP_DMA - Allocation suitable for DMA. 
* Should only be used for kmalloc() caches. Otherwise, use a diff --git a/kernel/profile.c b/kernel/profile.c index 6631e1ef55ab..ebdd9c1a86b4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -549,14 +549,14 @@ static int create_hash_tables(void) struct page *page; page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); page = alloc_pages_exact_node(node, - GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 0); if (!page) goto out_cleanup; diff --git a/mm/migrate.c b/mm/migrate.c index 482a33d89134..b494fdb9a636 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1158,7 +1158,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, pm->node); else return alloc_pages_exact_node(pm->node, - GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); + GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } /* @@ -1544,9 +1544,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page, struct page *newpage; newpage = alloc_pages_exact_node(nid, - (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN) & + (GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0); return newpage; @@ -1747,7 +1747,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + HPAGE_PMD_ORDER); if (!new_page) goto out_fail; -- cgit v1.3-14-g43fede From 7500d9363f7e356a5a3f10f1778f2e4f8a7eba94 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Mon, 10 Mar 2014 19:31:51 +0530 Subject: PM / suspend: Remove unnecessary !! Double negation (!!) is normally required to get 0 or 1 out of an expression. A comparison always returns 0 or 1, so there is no need to apply !! on top of it. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 62ee437b5c7e..90b3d9366d1a 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -39,7 +39,7 @@ static const struct platform_suspend_ops *suspend_ops; static bool need_suspend_ops(suspend_state_t state) { - return !!(state > PM_SUSPEND_FREEZE); + return state > PM_SUSPEND_FREEZE; } static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head); -- cgit v1.3-14-g43fede From d44753b843e093f9e1f2f14806fbe106fff74898 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 3 Mar 2014 12:09:21 +0100 Subject: sched/deadline: Deny unprivileged users to set/change SCHED_DEADLINE policy Deny the use of the SCHED_DEADLINE policy to unprivileged users. Even though root users can set the policy for normal users, we don't want the latter to be able to change their parameters (safest behavior).
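The user-visible effect can be sketched from userspace; this is a hedged example (the sched_attr layout mirrors the then-new sched_setattr() ABI, and __NR_sched_setattr must come from sufficiently recent kernel headers):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/types.h>

struct sched_attr {			/* local copy of the new ABI */
	__u32 size;
	__u32 sched_policy;
	__u64 sched_flags;
	__s32 sched_nice;
	__u32 sched_priority;
	__u64 sched_runtime;
	__u64 sched_deadline;
	__u64 sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = 6;				/* SCHED_DEADLINE */
	attr.sched_runtime = 10 * 1000 * 1000;		/* 10 ms */
	attr.sched_deadline = 30 * 1000 * 1000;
	attr.sched_period = 30 * 1000 * 1000;

	/* With this patch, a non-root caller now fails with EPERM. */
	if (syscall(__NR_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}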
Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1393844961-18097-1-git-send-email-juri.lelli@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6edbef296ece..f5c6635b806c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3338,6 +3338,15 @@ recheck: return -EPERM; } + /* + * Can't set/change SCHED_DEADLINE policy at all for now + * (safest behavior); in the future we would like to allow + * unprivileged DL tasks to increase their relative deadline + * or reduce their runtime (both ways reducing utilization) + */ + if (dl_policy(policy)) + return -EPERM; + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. -- cgit v1.3-14-g43fede From 177c53d943368fc97644ebc0a250dc8e2d124250 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 28 Feb 2014 13:39:05 +0100 Subject: stop_machine: Fix^2 race between stop_two_cpus() and stop_cpus() We must use smp_call_function_single(.wait=1) for the irq_cpu_stop_queue_work() to ensure the queueing is actually done under stop_cpus_lock. Without this we could have dropped the lock by the time we do the queueing and get the race we tried to fix. Fixes: 7053ea1a34fa ("stop_machine: Fix race between stop_two_cpus() and stop_cpus()") Signed-off-by: Peter Zijlstra Cc: Prarit Bhargava Cc: Rik van Riel Cc: Mel Gorman Cc: Christoph Hellwig Cc: Andrew Morton Link: http://lkml.kernel.org/r/20140228123905.GK3104@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 84571e09c907..01fbae5b97b7 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * */ smp_call_function_single(min(cpu1, cpu2), &irq_cpu_stop_queue_work, - &call_args, 0); + &call_args, 1); lg_local_unlock(&stop_cpus_lock); preempt_enable(); -- cgit v1.3-14-g43fede From 96b3d28bf4b00f62fc8386ff5d487d1830793a3d Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Thu, 6 Mar 2014 14:25:28 +0900 Subject: sched/clock: Prevent tracing recursion in sched_clock_cpu() Prevent tracing of preempt_disable/enable() in sched_clock_cpu(). When CONFIG_DEBUG_PREEMPT is enabled, preempt_disable/enable() are traced and this causes trace_clock() users (and probably others) to go into an infinite recursion. Systems with a stable sched_clock() are not affected. This problem is similar to that fixed by upstream commit 95ef1e52922 ("KVM guest: prevent tracing recursion with kvmclock"). 
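The recursion can be pictured with a short sketch (tracer_safe_clock() is a hypothetical helper for illustration; the actual patch only substitutes the two calls inside sched_clock_cpu()):

/*
 * With CONFIG_DEBUG_PREEMPT, preempt_disable()/preempt_enable() emit
 * trace events, so a tracer's clock read recurses:
 *   tracer callback -> trace_clock() -> sched_clock_cpu()
 *     -> preempt_disable() -> trace event -> tracer callback -> ...
 * The _notrace variants change the preempt count without tracing:
 */
static u64 tracer_safe_clock(int cpu)
{
	u64 clock;

	preempt_disable_notrace();	/* no preempt-off event emitted */
	clock = sched_clock_cpu(cpu);
	preempt_enable_notrace();	/* no preempt-on event either */
	return clock;
}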
Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1394083528.4524.3.camel@nexus Signed-off-by: Ingo Molnar --- kernel/sched/clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index 43c2bcc35761..b30a2924ef14 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu) if (unlikely(!sched_clock_running)) return 0ull; - preempt_disable(); + preempt_disable_notrace(); scd = cpu_sdc(cpu); if (cpu != smp_processor_id()) clock = sched_clock_remote(scd); else clock = sched_clock_local(scd); - preempt_enable(); + preempt_enable_notrace(); return clock; } -- cgit v1.3-14-g43fede From 30cdd69e2a266505ca8229c944d361ff350a6959 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Mar 2014 08:48:51 +0100 Subject: cpuidle/idle: Move the cpuidle_idle_call function to idle.c cpuidle_idle_call() does nothing more than call the three individual functions and is no longer used by any arch-specific code, only by the cpuidle framework code. We can move this function into the idle task code to ensure better proximity to the scheduler code. Signed-off-by: Daniel Lezcano Acked-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: rjw@rjwysocki.net Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1393832934-11625-2-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- drivers/cpuidle/cpuidle.c | 49 ----------------------------------------- include/linux/cpuidle.h | 2 -- kernel/sched/idle.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 1506d69b3f0f..166a7322a2b6 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -172,55 +172,6 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index) cpuidle_curr_governor->reflect(dev, index); } -/** - * cpuidle_idle_call - the main idle loop - * - * NOTE: no locks or semaphores should be used here - * return non-zero on failure - */ -int cpuidle_idle_call(void) -{ - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); - struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); - int next_state, entered_state, ret; - bool broadcast; - - ret = cpuidle_enabled(drv, dev); - if (ret < 0) - return ret; - - /* ask the governor for the next state */ - next_state = cpuidle_select(drv, dev); - - if (need_resched()) { - dev->last_residency = 0; - /* give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, next_state); - local_irq_enable(); - return 0; - } - - broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); - - if (broadcast && - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) - return -EBUSY; - - trace_cpu_idle_rcuidle(next_state, dev->cpu); - - entered_state = cpuidle_enter(drv, dev, next_state); - - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); - - if (broadcast) - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); - - /* give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); - - return 0; -} - /** * cpuidle_install_idle_handler - installs the cpuidle idle loop handler */ diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index accc2dd72049..8d97962d6d64 ---
a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -128,7 +128,6 @@ extern int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index); extern void cpuidle_reflect(struct cpuidle_device *dev, int index); -extern int cpuidle_idle_call(void); extern int cpuidle_register_driver(struct cpuidle_driver *drv); extern struct cpuidle_driver *cpuidle_get_driver(void); extern struct cpuidle_driver *cpuidle_driver_ref(void); @@ -160,7 +159,6 @@ static inline int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) {return -ENODEV; } static inline void cpuidle_reflect(struct cpuidle_device *dev, int index) { } -static inline int cpuidle_idle_call(void) { return -ENODEV; } static inline int cpuidle_register_driver(struct cpuidle_driver *drv) {return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index b7976a127178..d5aaf5eb4531 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -63,6 +63,62 @@ void __weak arch_cpu_idle(void) local_irq_enable(); } +#ifdef CONFIG_CPU_IDLE +/** + * cpuidle_idle_call - the main idle function + * + * NOTE: no locks or semaphores should be used here + * return non-zero on failure + */ +static int cpuidle_idle_call(void) +{ + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int next_state, entered_state, ret; + bool broadcast; + + ret = cpuidle_enabled(drv, dev); + if (ret < 0) + return ret; + + /* ask the governor for the next state */ + next_state = cpuidle_select(drv, dev); + + if (need_resched()) { + dev->last_residency = 0; + /* give the governor an opportunity to reflect on the outcome */ + cpuidle_reflect(dev, next_state); + local_irq_enable(); + return 0; + } + + broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + + if (broadcast && + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + return -EBUSY; + + trace_cpu_idle_rcuidle(next_state, dev->cpu); + + entered_state = cpuidle_enter(drv, dev, next_state); + + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + + if (broadcast) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + + /* give the governor an opportunity to reflect on the outcome */ + cpuidle_reflect(dev, entered_state); + + return 0; +} +#else +static inline int cpuidle_idle_call(void) +{ + return -ENODEV; +} +#endif + /* * Generic idle loop implementation */ -- cgit v1.3-14-g43fede From c8cc7d4de7a4f2fb1f8774ec2de5b49c46c42e64 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Mar 2014 08:48:52 +0100 Subject: sched/idle: Reorganize the idle loop Now that we have the main cpuidle function in idle.c, move some code from the idle mainloop to this function for the sake of clarity. That removes if-then-else indentation that is difficult to follow when looking at the code. This patch does not change the current behavior.
Signed-off-by: Daniel Lezcano Acked-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: tglx@linutronix.de Cc: rjw@rjwysocki.net Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1393832934-11625-3-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- include/linux/cpuidle.h | 2 ++ kernel/sched/idle.c | 33 +++++++++++++++------------------ 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 8d97962d6d64..b0238cba440b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -180,6 +180,8 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } +static inline struct cpuidle_driver *cpuidle_get_cpu_driver( + struct cpuidle_device *dev) {return NULL; } #endif #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index d5aaf5eb4531..dc8a2466418f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -63,7 +63,6 @@ void __weak arch_cpu_idle(void) local_irq_enable(); } -#ifdef CONFIG_CPU_IDLE /** * cpuidle_idle_call - the main idle function * @@ -77,9 +76,14 @@ static int cpuidle_idle_call(void) int next_state, entered_state, ret; bool broadcast; + stop_critical_timings(); + rcu_idle_enter(); + ret = cpuidle_enabled(drv, dev); - if (ret < 0) - return ret; + if (ret < 0) { + arch_cpu_idle(); + goto out; + } /* ask the governor for the next state */ next_state = cpuidle_select(drv, dev); @@ -89,7 +93,7 @@ static int cpuidle_idle_call(void) /* give the governor an opportunity to reflect on the outcome */ cpuidle_reflect(dev, next_state); local_irq_enable(); - return 0; + goto out; } broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); @@ -109,15 +113,15 @@ static int cpuidle_idle_call(void) /* give the governor an opportunity to reflect on the outcome */ cpuidle_reflect(dev, entered_state); +out: + if (WARN_ON_ONCE(irqs_disabled())) + local_irq_enable(); + + rcu_idle_exit(); + start_critical_timings(); return 0; } -#else -static inline int cpuidle_idle_call(void) -{ - return -ENODEV; -} -#endif /* * Generic idle loop implementation @@ -150,14 +154,7 @@ static void cpu_idle_loop(void) cpu_idle_poll(); } else { if (!current_clr_polling_and_test()) { - stop_critical_timings(); - rcu_idle_enter(); - if (cpuidle_idle_call()) - arch_cpu_idle(); - if (WARN_ON_ONCE(irqs_disabled())) - local_irq_enable(); - rcu_idle_exit(); - start_critical_timings(); + cpuidle_idle_call(); } else { local_irq_enable(); } -- cgit v1.3-14-g43fede From 8ca3c6424f4988fc19ed1067b121fbaf2e884d77 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Mar 2014 08:48:53 +0100 Subject: sched/idle: Move idle conditions in cpuidle_idle main function This patch moves the conditions checked before entering idle into the cpuidle main function located in idle.c. That simplifies the idle mainloop functions and increases the readability of the conditions to enter truly idle. This patch is code reorganization and does not change the behavior of the function.
Signed-off-by: Daniel Lezcano Signed-off-by: Peter Zijlstra Cc: tglx@linutronix.de Cc: rjw@rjwysocki.net Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1393832934-11625-4-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 78 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index dc8a2466418f..cc7a6f3801ff 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -76,44 +76,59 @@ static int cpuidle_idle_call(void) int next_state, entered_state, ret; bool broadcast; + if (current_clr_polling_and_test()) { + local_irq_enable(); + __current_set_polling(); + return 0; + } + stop_critical_timings(); rcu_idle_enter(); ret = cpuidle_enabled(drv, dev); - if (ret < 0) { - arch_cpu_idle(); - goto out; - } - /* ask the governor for the next state */ - next_state = cpuidle_select(drv, dev); + if (!ret) { + /* ask the governor for the next state */ + next_state = cpuidle_select(drv, dev); - if (need_resched()) { - dev->last_residency = 0; - /* give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, next_state); - local_irq_enable(); - goto out; - } + if (current_clr_polling_and_test()) { + dev->last_residency = 0; + entered_state = next_state; + local_irq_enable(); + } else { + broadcast = !!(drv->states[next_state].flags & + CPUIDLE_FLAG_TIMER_STOP); + + if (broadcast) + ret = clockevents_notify( + CLOCK_EVT_NOTIFY_BROADCAST_ENTER, + &dev->cpu); - broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP); + if (!ret) { + trace_cpu_idle_rcuidle(next_state, dev->cpu); - if (broadcast && - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) - return -EBUSY; + entered_state = cpuidle_enter(drv, dev, + next_state); - trace_cpu_idle_rcuidle(next_state, dev->cpu); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, + dev->cpu); - entered_state = cpuidle_enter(drv, dev, next_state); + if (broadcast) + clockevents_notify( + CLOCK_EVT_NOTIFY_BROADCAST_EXIT, + &dev->cpu); - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); + /* give the governor an opportunity to reflect on the outcome */ + cpuidle_reflect(dev, entered_state); + } + } + } + + if (ret) + arch_cpu_idle(); - if (broadcast) - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + __current_set_polling(); - /* give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); -out: if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); @@ -150,16 +165,11 @@ static void cpu_idle_loop(void) * know that the IPI is going to arrive right * away */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + if (cpu_idle_force_poll || tick_check_broadcast_expired()) cpu_idle_poll(); - } else { - if (!current_clr_polling_and_test()) { - cpuidle_idle_call(); - } else { - local_irq_enable(); - } - __current_set_polling(); - } + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } -- cgit v1.3-14-g43fede From a1d028bd6d2b7789d15eddfd07c5bea2aaf36040 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 3 Mar 2014 08:48:54 +0100 Subject: sched/idle: Add more comments to the code The idle main function is a complex and a critical function. Added more comments to the code. 
Signed-off-by: Daniel Lezcano Acked-by: Nicolas Pitre Signed-off-by: Peter Zijlstra Cc: tglx@linutronix.de Cc: rjw@rjwysocki.net Cc: preeti@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1393832934-11625-5-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cc7a6f3801ff..8f4390a079c7 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -76,21 +76,49 @@ static int cpuidle_idle_call(void) int next_state, entered_state, ret; bool broadcast; + /* + * Check if the idle task must be rescheduled. If it is the + * case, exit the function after re-enabling the local irq and + * set again the polling flag + */ if (current_clr_polling_and_test()) { local_irq_enable(); __current_set_polling(); return 0; } + /* + * During the idle period, stop measuring the disabled irqs + * critical sections latencies + */ stop_critical_timings(); + + /* + * Tell the RCU framework we are entering an idle section, + * so no more rcu read side critical sections and one more + * step to the grace period + */ rcu_idle_enter(); + /* + * Check if the cpuidle framework is ready, otherwise fallback + * to the default arch specific idle method + */ ret = cpuidle_enabled(drv, dev); if (!ret) { - /* ask the governor for the next state */ + /* + * Ask the governor to choose an idle state it thinks + * it is convenient to go to. There is *always* a + * convenient idle state + */ next_state = cpuidle_select(drv, dev); + /* + * The idle task must be scheduled, it is pointless to + * go to idle, just update no idle residency and get + * out of this function + */ if (current_clr_polling_and_test()) { dev->last_residency = 0; entered_state = next_state; @@ -100,6 +128,14 @@ static int cpuidle_idle_call(void) CPUIDLE_FLAG_TIMER_STOP); if (broadcast) + /* + * Tell the time framework to switch + * to a broadcast timer because our + * local timer will be shutdown. If a + * local timer is used from another + * cpu as a broadcast timer, this call + * may fail if it is not available + */ ret = clockevents_notify( CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu); @@ -107,6 +143,14 @@ static int cpuidle_idle_call(void) if (!ret) { trace_cpu_idle_rcuidle(next_state, dev->cpu); + /* + * Enter the idle state previously + * returned by the governor + * decision. 
This function will block + * until an interrupt occurs and will + * take care of re-enabling the local + * interrupts + */ entered_state = cpuidle_enter(drv, dev, next_state); @@ -118,17 +162,28 @@ static int cpuidle_idle_call(void) CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); - /* give the governor an opportunity to reflect on the outcome */ + /* + * Give the governor an opportunity to reflect on the + * outcome + */ cpuidle_reflect(dev, entered_state); } } } + /* + * We can't use the cpuidle framework, let's use the default + * idle routine + */ if (ret) arch_cpu_idle(); __current_set_polling(); + /* + * It is up to the idle functions to enable back the local + * interrupt + */ if (WARN_ON_ONCE(irqs_disabled())) local_irq_enable(); -- cgit v1.3-14-g43fede From cfa77bc4af2c75c0781ee76cde2dd104c6c8e2b7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 2 Mar 2014 16:56:38 +0100 Subject: perf: Disallow user-space callchains for function trace events Recent issues with user space callchains processing within page fault handler tracing showed as Peter said 'there's just too much fail surface'. Related list discussions: http://marc.info/?t=139302086500001&r=1&w=2 http://marc.info/?t=139301437300003&r=1&w=2 Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "H. Peter Anvin" Cc: Vince Weaver Cc: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1393775800-13524-2-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e854f420e033..d5e01c3f4e69 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -31,9 +31,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, } /* The ftrace function trace is allowed only for root. */ - if (ftrace_event_is_function(tp_event) && - perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (ftrace_event_is_function(tp_event)) { + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* + * We don't allow user space callchains for function trace + * event, due to issues with page faults while tracing page + * fault handler and its overall trickiness nature. + */ + if (!p_event->attr.exclude_callchain_user) + return -EINVAL; + } /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) -- cgit v1.3-14-g43fede From 63c45f4ba533e9749da16298db53e491c25d805b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sun, 2 Mar 2014 16:56:39 +0100 Subject: perf: Disallow user-space stack dumps for function trace events Recent issues with user space callchains processing within page fault handler tracing showed as Peter said 'there's just too much fail surface'. The user space stack dump is just another source of the this issue. Related list discussions: http://marc.info/?t=139302086500001&r=1&w=2 http://marc.info/?t=139301437300003&r=1&w=2 Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Vince Weaver Cc: Steven Rostedt Cc: Paul Mackerras Cc: H. 
Peter Anvin Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1393775800-13524-3-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index d5e01c3f4e69..c894614de14d 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -42,6 +42,13 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, */ if (!p_event->attr.exclude_callchain_user) return -EINVAL; + + /* + * Same reason to disable user stack dump as for user space + * callchains above. + */ + if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER) + return -EINVAL; } /* No tracing, just counting, so no obvious leak */ -- cgit v1.3-14-g43fede From 734ff2a71f9e6aa6fedfa5a9a34818b8586516d5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 4 Mar 2014 19:25:46 +0400 Subject: sched/rt: Fix picking RT and DL tasks from empty queue The problems: 1) We check rt_nr_running before the call to put_prev_task(). If the previous task is RT, its rt_rq may become throttled and dequeued after this call. If p is from rq->rt this just causes picking a task from a throttled queue, but if its rt_rq is a child we are guaranteed to hit the BUG_ON. 2) The same applies to the deadline class; the only difference is that we operate only on the dl_rq. This patch fixes all the above problems, and it adds a small skip in the DL update like we've already done for the RT class: if (unlikely((s64)delta_exec <= 0)) return; This will optimize sequential update_curr_dl() calls a little. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Juri Lelli Link: http://lkml.kernel.org/r/1393946746.3643.3.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 10 ++++++++-- kernel/sched/rt.c | 7 +++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e4f3ac3b8514..27ef40925525 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -609,8 +609,8 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec < 0)) - delta_exec = 0; + if (unlikely((s64)delta_exec <= 0)) + return; schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -1023,6 +1023,12 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev) if (need_pull_dl_task(rq, prev)) pull_dl_task(rq); + /* + * When prev is DL, we may throttle it in put_prev_task(). + * So, we update time before we check for dl_nr_running. + */ + if (prev->sched_class == &dl_sched_class) + update_curr_dl(rq); if (unlikely(!dl_rq->dl_nr_running)) return NULL; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index facc824334fb..f3cee0a63b76 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1379,6 +1379,13 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev) return RETRY_TASK; } + /* + * We may dequeue prev's rt_rq in put_prev_task(). + * So, we update time before rt_nr_running check.
+ */ + if (prev->sched_class == &rt_sched_class) + update_curr_rt(rq); + if (!rt_rq->rt_nr_running) return NULL; -- cgit v1.3-14-g43fede From e4aa358b6c23f98b2715594f6b1e9a4996a55f04 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 6 Mar 2014 13:31:55 +0400 Subject: sched/fair: Push down check for high priority class task into idle_balance() We close the idle_exit_fair() bracket in case we've pulled something or we've received a task of a higher-priority class. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Cc: Vincent Guittot Link: http://lkml.kernel.org/r/1394098315.19290.10.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++----- kernel/sched/idle_task.c | 1 - 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d8482e1c575e..b956e70fc503 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4787,17 +4787,16 @@ simple: return p; idle: + new_tasks = idle_balance(rq); /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ - new_tasks = idle_balance(rq); - - if (rq->nr_running != rq->cfs.h_nr_running) + if (new_tasks < 0) return RETRY_TASK; - if (new_tasks) + if (new_tasks > 0) goto again; return NULL; @@ -6728,8 +6727,14 @@ static int idle_balance(struct rq *this_rq) this_rq->max_idle_balance_cost = curr_cost; out: - if (pulled_task) + /* Is there a task of a high priority class? */ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) { + idle_exit_fair(this_rq); this_rq->idle_stamp = 0; + } return pulled_task; } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 1f3725882838..879f2b75266a 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -29,7 +29,6 @@ pick_next_task_idle(struct rq *rq, struct task_struct *prev) put_prev_task(rq, prev); schedstat_inc(rq, sched_goidle); - idle_enter_fair(rq); return rq->idle; } -- cgit v1.3-14-g43fede From 4c6c4e38c4e9a454889298dcc498174968d14a09 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 6 Mar 2014 13:32:01 +0400 Subject: sched/core: Fix endless loop in pick_next_task() 1) Single-CPU machine case. When the rq has only RT tasks, but none of them can be picked because of throttling, we enter an endless loop. pick_next_task_{dl,rt} return NULL. In pick_next_task_fair() we permanently go to retry if (rq->nr_running != rq->cfs.h_nr_running) return RETRY_TASK; (rq->nr_running is not being decremented when rt_rq becomes throttled). There is no chance to unthrottle any rt_rq or to wake a fair task here, because the rq is locked permanently and interrupts are disabled. 2) On SMP this can cause a hang too. Although we unlock the rq in idle_balance(), interrupts are still disabled. The solution is to check for available tasks in the DL and RT classes instead of checking the sum.
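Schematically (an annotated restatement; the condition is the one added to idle_balance() in the diff below):

/*
 * Single-CPU livelock before the fix, with only throttled RT tasks:
 *
 *   pick_next_task()                rq->lock held, IRQs disabled
 *     pick_next_task_dl() -> NULL   (dl_rq throttled/empty)
 *     pick_next_task_rt() -> NULL   (rt_rq throttled)
 *     pick_next_task_fair():
 *       rq->nr_running != rq->cfs.h_nr_running -> RETRY_TASK
 *     goto again;                   nothing can ever unthrottle
 *
 * The fix asks each class whether it really has runnable work:
 */
if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
    (this_rq->dl.dl_nr_running ||
     (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
	pulled_task = -1;	/* a higher class truly has a task */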
Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394098321.19290.11.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +++- kernel/sched/rt.c | 10 ---------- kernel/sched/sched.h | 12 ++++++++++++ 3 files changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b956e70fc503..10db4a87ad72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6728,7 +6728,9 @@ static int idle_balance(struct rq *this_rq) out: /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_running) + if (this_rq->nr_running != this_rq->cfs.h_nr_running && + (this_rq->dl.dl_nr_running || + (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt)))) pulled_task = -1; if (pulled_task) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f3cee0a63b76..d8cdf1618551 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -470,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) dequeue_rt_entity(rt_se); } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; -} - static int rt_se_boosted(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); @@ -545,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) { } -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - static inline const struct cpumask *sched_rt_period_mask(void) { return cpu_online_mask; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 378bff76267f..f2de7a175620 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -423,6 +423,18 @@ struct rt_rq { #endif }; +#ifdef CONFIG_RT_GROUP_SCHED +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +#else +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} +#endif + /* Deadline class' related fields in a runqueue */ struct dl_rq { /* runqueue is an rbtree, ordered by deadline */ -- cgit v1.3-14-g43fede From 35805ff8f4fc535ac85330170d3c56829c87c677 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 6 Mar 2014 19:16:15 +0400 Subject: sched/fair: Fix endless loop in idle_balance() Check for fair tasks number to decide, that we've pulled a task. rq's nr_running may contain throttled RT tasks. Signed-off-by: Kirill Tkhai Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394118975.19290.104.camel@tkhai Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10db4a87ad72..f1eedae1e83e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6710,7 +6710,7 @@ static int idle_balance(struct rq *this_rq) * While browsing the domains, we released the rq lock. 
* A task could have be enqueued in the meantime */ - if (this_rq->nr_running && !pulled_task) { + if (this_rq->cfs.h_nr_running && !pulled_task) { pulled_task = 1; goto out; } -- cgit v1.3-14-g43fede From 156654f491dd8d52687a5fbe1637f472a52ce75b Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 28 Feb 2014 07:23:11 +0100 Subject: sched/numa: Move task_numa_free() to __put_task_struct() Bad idea on -rt: [ 908.026136] [] rt_spin_lock_slowlock+0xaa/0x2c0 [ 908.026145] [] task_numa_free+0x31/0x130 [ 908.026151] [] finish_task_switch+0xce/0x100 [ 908.026156] [] thread_return+0x48/0x4ae [ 908.026160] [] schedule+0x25/0xa0 [ 908.026163] [] rt_spin_lock_slowlock+0xd5/0x2c0 [ 908.026170] [] get_signal_to_deliver+0xaf/0x680 [ 908.026175] [] do_signal+0x3d/0x5b0 [ 908.026179] [] do_notify_resume+0x90/0xe0 [ 908.026186] [] int_signal+0x12/0x17 [ 908.026193] [<00007ff2a388b1d0>] 0x7ff2a388b1cf and since upstream does not mind where we do this, be a bit nicer ... Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Mel Gorman Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1393568591.6018.27.camel@marge.simpson.net Signed-off-by: Ingo Molnar --- kernel/fork.c | 1 + kernel/sched/core.c | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a17621c6cd42..332688e5e7b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -237,6 +237,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + task_numa_free(tsk); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd89c27bb56f..9e126a21c5c7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2151,8 +2151,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { - task_numa_free(prev); - if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); -- cgit v1.3-14-g43fede From c9122da1e2d29bd6a1475a0d1ce2aa6ac6ea25fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 13:32:16 +0100 Subject: locking: Move mcs_spinlock.h into kernel/locking/ The mcs_spinlock code is not meant (or suitable) as a generic locking primitive, therefore take it away from the normal includes and place it in kernel/locking/. This way the locking primitives implemented there can use it as part of their implementation but we do not risk it getting used inappropriately. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-byirmpamgr7h25m5kyavwpzx@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mcs_spinlock.h | 114 ------------------------------------------ kernel/locking/mcs_spinlock.h | 114 ++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mutex.c | 2 +- 3 files changed, 115 insertions(+), 115 deletions(-) delete mode 100644 include/linux/mcs_spinlock.h create mode 100644 kernel/locking/mcs_spinlock.h (limited to 'kernel') diff --git a/include/linux/mcs_spinlock.h b/include/linux/mcs_spinlock.h deleted file mode 100644 index f2a5c6360083..000000000000 --- a/include/linux/mcs_spinlock.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * MCS lock defines - * - * This file contains the main data structure and API definitions of MCS lock.
- * - * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock - * with the desirable properties of being fair, and with each cpu trying - * to acquire the lock spinning on a local variable. - * It avoids expensive cache bouncings that common test-and-set spin-lock - * implementations incur. - */ -#ifndef __LINUX_MCS_SPINLOCK_H -#define __LINUX_MCS_SPINLOCK_H - -#include - -struct mcs_spinlock { - struct mcs_spinlock *next; - int locked; /* 1 if lock acquired */ -}; - -#ifndef arch_mcs_spin_lock_contended -/* - * Using smp_load_acquire() provides a memory barrier that ensures - * subsequent operations happen after the lock is acquired. - */ -#define arch_mcs_spin_lock_contended(l) \ -do { \ - while (!(smp_load_acquire(l))) \ - arch_mutex_cpu_relax(); \ -} while (0) -#endif - -#ifndef arch_mcs_spin_unlock_contended -/* - * smp_store_release() provides a memory barrier to ensure all - * operations in the critical section has been completed before - * unlocking. - */ -#define arch_mcs_spin_unlock_contended(l) \ - smp_store_release((l), 1) -#endif - -/* - * Note: the smp_load_acquire/smp_store_release pair is not - * sufficient to form a full memory barrier across - * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. - * For applications that need a full barrier across multiple cpus - * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be - * used after mcs_lock. - */ - -/* - * In order to acquire the lock, the caller should declare a local node and - * pass a reference of the node to this function in addition to the lock. - * If the lock has already been acquired, then this will proceed to spin - * on this node->locked until the previous lock holder sets the node->locked - * in mcs_spin_unlock(). - * - * We don't inline mcs_spin_lock() so that perf can correctly account for the - * time spent in this lock function. - */ -static inline -void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) -{ - struct mcs_spinlock *prev; - - /* Init node */ - node->locked = 0; - node->next = NULL; - - prev = xchg(lock, node); - if (likely(prev == NULL)) { - /* - * Lock acquired, don't need to set node->locked to 1. Threads - * only spin on its own node->locked value for lock acquisition. - * However, since this thread can immediately acquire the lock - * and does not proceed to spin on its own node->locked, this - * value won't be used. If a debug mode is needed to - * audit lock status, then set node->locked value here. - */ - return; - } - ACCESS_ONCE(prev->next) = node; - - /* Wait until the lock holder passes the lock down. */ - arch_mcs_spin_lock_contended(&node->locked); -} - -/* - * Releases the lock. The caller should pass in the corresponding node that - * was used to acquire the lock. - */ -static inline -void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) -{ - struct mcs_spinlock *next = ACCESS_ONCE(node->next); - - if (likely(!next)) { - /* - * Release the lock by setting it to NULL - */ - if (likely(cmpxchg(lock, node, NULL) == node)) - return; - /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) - arch_mutex_cpu_relax(); - } - - /* Pass lock to next waiter. 
*/ - arch_mcs_spin_unlock_contended(&next->locked); -} - -#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h new file mode 100644 index 000000000000..f2a5c6360083 --- /dev/null +++ b/kernel/locking/mcs_spinlock.h @@ -0,0 +1,114 @@ +/* + * MCS lock defines + * + * This file contains the main data structure and API definitions of MCS lock. + * + * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock + * with the desirable properties of being fair, and with each cpu trying + * to acquire the lock spinning on a local variable. + * It avoids expensive cache bouncings that common test-and-set spin-lock + * implementations incur. + */ +#ifndef __LINUX_MCS_SPINLOCK_H +#define __LINUX_MCS_SPINLOCK_H + +#include + +struct mcs_spinlock { + struct mcs_spinlock *next; + int locked; /* 1 if lock acquired */ +}; + +#ifndef arch_mcs_spin_lock_contended +/* + * Using smp_load_acquire() provides a memory barrier that ensures + * subsequent operations happen after the lock is acquired. + */ +#define arch_mcs_spin_lock_contended(l) \ +do { \ + while (!(smp_load_acquire(l))) \ + arch_mutex_cpu_relax(); \ +} while (0) +#endif + +#ifndef arch_mcs_spin_unlock_contended +/* + * smp_store_release() provides a memory barrier to ensure all + * operations in the critical section has been completed before + * unlocking. + */ +#define arch_mcs_spin_unlock_contended(l) \ + smp_store_release((l), 1) +#endif + +/* + * Note: the smp_load_acquire/smp_store_release pair is not + * sufficient to form a full memory barrier across + * cpus for many architectures (except x86) for mcs_unlock and mcs_lock. + * For applications that need a full barrier across multiple cpus + * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be + * used after mcs_lock. + */ + +/* + * In order to acquire the lock, the caller should declare a local node and + * pass a reference of the node to this function in addition to the lock. + * If the lock has already been acquired, then this will proceed to spin + * on this node->locked until the previous lock holder sets the node->locked + * in mcs_spin_unlock(). + * + * We don't inline mcs_spin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +static inline +void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* + * Lock acquired, don't need to set node->locked to 1. Threads + * only spin on its own node->locked value for lock acquisition. + * However, since this thread can immediately acquire the lock + * and does not proceed to spin on its own node->locked, this + * value won't be used. If a debug mode is needed to + * audit lock status, then set node->locked value here. + */ + return; + } + ACCESS_ONCE(prev->next) = node; + + /* Wait until the lock holder passes the lock down. */ + arch_mcs_spin_lock_contended(&node->locked); +} + +/* + * Releases the lock. The caller should pass in the corresponding node that + * was used to acquire the lock. 
+ */ +static inline +void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) +{ + struct mcs_spinlock *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + + /* Pass lock to next waiter. */ + arch_mcs_spin_unlock_contended(&next->locked); +} + +#endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 45fe1b5293d6..4f408be39a07 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include "mcs_spinlock.h" /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, -- cgit v1.3-14-g43fede From 46af29e479cc0c1c63633007993af5292c2c3e75 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:12 -0800 Subject: locking/mutexes: Return false if task need_resched() in mutex_can_spin_on_owner() The mutex_can_spin_on_owner() function should also return false if the task needs to be rescheduled to avoid entering the MCS queue when it needs to reschedule. Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1390936396-3962-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 4f408be39a07..e6d646b18d6c 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -166,6 +166,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + if (need_resched()) + return 0; + rcu_read_lock(); owner = ACCESS_ONCE(lock->owner); if (owner) -- cgit v1.3-14-g43fede From 47667fa1502e4d759df87e9cc7fbc0f202483361 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:13 -0800 Subject: locking/mutexes: Modify the way optimistic spinners are queued The mutex->spin_mlock was introduced in order to ensure that only 1 thread spins for lock acquisition at a time to reduce cache line contention. When lock->owner is NULL and the lock->count is still not 1, the spinner(s) will continually release and obtain the lock->spin_mlock. This can generate quite a bit of overhead/contention, and also might just delay the spinner from getting the lock. This patch modifies the way optimistic spinners are queued by queuing before entering the optimistic spinning loop as opposed to acquiring before every call to mutex_spin_on_owner(). So in situations where the spinner requires a few extra spins before obtaining the lock, there will only be 1 spinner trying to get the lock and it will avoid the overhead from unnecessarily unlocking and locking the spin_mlock.
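Reduced to its shape, the queueing change looks like the sketch below (mutex_try_acquire() is a hypothetical stand-in for the atomic_cmpxchg() on lock->count; error paths omitted):

/* Before: the MCS slot is taken and dropped on every iteration. */
for (;;) {
	mcs_spin_lock(&lock->mcs_lock, &node);
	if (mutex_try_acquire(lock))
		break;			/* got the mutex */
	mcs_spin_unlock(&lock->mcs_lock, &node);
}

/* After: one spinner keeps its MCS slot for the whole spin phase. */
mcs_spin_lock(&lock->mcs_lock, &node);
for (;;) {
	if (mutex_try_acquire(lock))
		break;			/* got the mutex */
	arch_mutex_cpu_relax();
}
mcs_spin_unlock(&lock->mcs_lock, &node);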
Signed-off-by: Jason Low Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: chegu_vinod@hp.com Cc: Waiman.Long@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: torvalds@linux-foundation.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390936396-3962-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e6d646b18d6c..82dad2ccd40b 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -403,9 +403,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; + mcs_spin_lock(&lock->mcs_lock, &node); for (;;) { struct task_struct *owner; - struct mcs_spinlock node; if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; @@ -420,19 +420,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * performed the optimistic spinning cannot be done. */ if (ACCESS_ONCE(ww->ctx)) - goto slowpath; + break; } /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ - mcs_spin_lock(&lock->mcs_lock, &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) { - mcs_spin_unlock(&lock->mcs_lock, &node); - goto slowpath; - } + if (owner && !mutex_spin_on_owner(lock, owner)) + break; if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { @@ -449,7 +446,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_enable(); return 0; } - mcs_spin_unlock(&lock->mcs_lock, &node); /* * When there's no owner, we might have preempted between the @@ -458,7 +454,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * the owner complete. */ if (!owner && (need_resched() || rt_task(task))) - goto slowpath; + break; /* * The cpu_relax() call is a compiler barrier which forces @@ -468,6 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } + mcs_spin_unlock(&lock->mcs_lock, &node); slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.3-14-g43fede From 1d8fe7dc8078b23e060ec62ccb4cdc1ac3c41bf8 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Tue, 28 Jan 2014 11:13:14 -0800 Subject: locking/mutexes: Unlock the mutex without the wait_lock When running workloads that have high contention in mutexes on an 8 socket machine, mutex spinners would often spin for a long time with no lock owner. The main reason why this is occurring is in __mutex_unlock_common_slowpath(): when __mutex_slowpath_needs_to_unlock() is true, the owner needs to acquire the mutex->wait_lock before releasing the mutex (setting lock->count to 1). When the wait_lock is contended, this delays the mutex from being released. We should be able to release the mutex without holding the wait_lock.
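The fix is a straight reordering in __mutex_unlock_common_slowpath(): release the lock word first, then take the wait_lock only for the waiter wakeup. A rough sketch of old vs. new ordering (simplified; the wakeup details are elided, see the diff below for the real code):

        /* Old: mutex released only after wait_lock is acquired. */
        spin_lock_mutex(&lock->wait_lock, flags);
        if (__mutex_slowpath_needs_to_unlock())
                atomic_set(&lock->count, 1);
        /* ... wake up the first waiter ... */
        spin_unlock_mutex(&lock->wait_lock, flags);

        /* New: release first, so spinners can take the lock right away. */
        if (__mutex_slowpath_needs_to_unlock())
                atomic_set(&lock->count, 1);
        spin_lock_mutex(&lock->wait_lock, flags);
        /* ... wake up the first waiter ... */
        spin_unlock_mutex(&lock->wait_lock, flags);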
Signed-off-by: Jason Low Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1390936396-3962-4-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 82dad2ccd40b..dc3d6f2bbe2a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -671,10 +671,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) struct mutex *lock = container_of(lock_count, struct mutex, count); unsigned long flags; - spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); - debug_mutex_unlock(lock); - /* * some architectures leave the lock unlocked in the fastpath failure * case, others need to leave it locked. In the later case we have to @@ -683,6 +679,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) if (__mutex_slowpath_needs_to_unlock()) atomic_set(&lock->count, 1); + spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); + if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = -- cgit v1.3-14-g43fede From fb0527bd5ea99bfeb2dd91e3c1433ecf745d6b99 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Jan 2014 12:51:42 +0100 Subject: locking/mutexes: Introduce cancelable MCS lock for adaptive spinning Since we want a task waiting for a mutex_lock() to go to sleep and reschedule on need_resched() we must be able to abort the mcs_spin_lock() around the adaptive spin. Therefore implement a cancelable mcs lock. 
Signed-off-by: Peter Zijlstra Cc: chegu_vinod@hp.com Cc: paulmck@linux.vnet.ibm.com Cc: Waiman.Long@hp.com Cc: torvalds@linux-foundation.org Cc: tglx@linutronix.de Cc: riel@redhat.com Cc: akpm@linux-foundation.org Cc: davidlohr@hp.com Cc: hpa@zytor.com Cc: andi@firstfloor.org Cc: aswin@hp.com Cc: scott.norton@hp.com Cc: Jason Low Link: http://lkml.kernel.org/n/tip-62hcl5wxydmjzd182zhvk89m@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 4 +- kernel/locking/Makefile | 2 +- kernel/locking/mcs_spinlock.c | 178 ++++++++++++++++++++++++++++++++++++++++++ kernel/locking/mcs_spinlock.h | 15 ++++ kernel/locking/mutex.c | 10 ++- 5 files changed, 202 insertions(+), 7 deletions(-) create mode 100644 kernel/locking/mcs_spinlock.c (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index c482e1d2cc49..11692dea18aa 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -46,7 +46,7 @@ * - detects multi-task circular deadlocks and prints out all affected * locks and tasks (and only those tasks) */ -struct mcs_spinlock; +struct optimistic_spin_queue; struct mutex { /* 1: unlocked, 0: locked, negative: locked, possible waiters */ atomic_t count; @@ -56,7 +56,7 @@ struct mutex { struct task_struct *owner; #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - struct mcs_spinlock *mcs_lock; /* Spinner MCS lock */ + struct optimistic_spin_queue *osq; /* Spinner MCS lock */ #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index baab8e5e7f66..2a9ee96ecf00 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o +obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c new file mode 100644 index 000000000000..838dc9e00669 --- /dev/null +++ b/kernel/locking/mcs_spinlock.c @@ -0,0 +1,178 @@ + +#include +#include +#include +#include "mcs_spinlock.h" + +#ifdef CONFIG_SMP + +/* + * An MCS like lock especially tailored for optimistic spinning for sleeping + * lock implementations (mutex, rwsem, etc). + * + * Using a single mcs node per CPU is safe because sleeping locks should not be + * called from interrupt context and we have preemption disabled while + * spinning. + */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node); + +/* + * Get a stable @node->next pointer, either for unlock() or unqueue() purposes. + * Can return NULL in case we were the last queued and we updated @lock instead. + */ +static inline struct optimistic_spin_queue * +osq_wait_next(struct optimistic_spin_queue **lock, + struct optimistic_spin_queue *node, + struct optimistic_spin_queue *prev) +{ + struct optimistic_spin_queue *next = NULL; + + for (;;) { + if (*lock == node && cmpxchg(lock, node, prev) == node) { + /* + * We were the last queued, we moved @lock back. @prev + * will now observe @lock and will complete its + * unlock()/unqueue(). + */ + break; + } + + /* + * We must xchg() the @node->next value, because if we were to + * leave it in, a concurrent unlock()/unqueue() from + * @node->next might complete Step-A and think its @prev is + * still valid. + * + * If the concurrent unlock()/unqueue() wins the race, we'll + * wait for either @lock to point to us, through its Step-B, or + * wait for a new @node->next from its Step-C. 
+ */ + if (node->next) { + next = xchg(&node->next, NULL); + if (next) + break; + } + + arch_mutex_cpu_relax(); + } + + return next; +} + +bool osq_lock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *prev, *next; + + node->locked = 0; + node->next = NULL; + + node->prev = prev = xchg(lock, node); + if (likely(prev == NULL)) + return true; + + ACCESS_ONCE(prev->next) = node; + + /* + * Normally @prev is untouchable after the above store; because at that + * moment unlock can proceed and wipe the node element from stack. + * + * However, since our nodes are static per-cpu storage, we're + * guaranteed their existence -- this allows us to apply + * cmpxchg in an attempt to undo our queueing. + */ + + while (!smp_load_acquire(&node->locked)) { + /* + * If we need to reschedule bail... so we can block. + */ + if (need_resched()) + goto unqueue; + + arch_mutex_cpu_relax(); + } + return true; + +unqueue: + /* + * Step - A -- stabilize @prev + * + * Undo our @prev->next assignment; this will make @prev's + * unlock()/unqueue() wait for a next pointer since @lock points to us + * (or later). + */ + + for (;;) { + if (prev->next == node && + cmpxchg(&prev->next, node, NULL) == node) + break; + + /* + * We can only fail the cmpxchg() racing against an unlock(), + * in which case we should observe @node->locked becomming + * true. + */ + if (smp_load_acquire(&node->locked)) + return true; + + arch_mutex_cpu_relax(); + + /* + * Or we race against a concurrent unqueue()'s step-B, in which + * case its step-C will write us a new @node->prev pointer. + */ + prev = ACCESS_ONCE(node->prev); + } + + /* + * Step - B -- stabilize @next + * + * Similar to unlock(), wait for @node->next or move @lock from @node + * back to @prev. + */ + + next = osq_wait_next(lock, node, prev); + if (!next) + return false; + + /* + * Step - C -- unlink + * + * @prev is stable because its still waiting for a new @prev->next + * pointer, @next is stable because our @node->next pointer is NULL and + * it will wait in Step-A. + */ + + ACCESS_ONCE(next->prev) = prev; + ACCESS_ONCE(prev->next) = next; + + return false; +} + +void osq_unlock(struct optimistic_spin_queue **lock) +{ + struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node); + struct optimistic_spin_queue *next; + + /* + * Fast path for the uncontended case. + */ + if (likely(cmpxchg(lock, node, NULL) == node)) + return; + + /* + * Second most likely case. + */ + next = xchg(&node->next, NULL); + if (next) { + ACCESS_ONCE(next->locked) = 1; + return; + } + + next = osq_wait_next(lock, node, NULL); + if (next) + ACCESS_ONCE(next->locked) = 1; +} + +#endif + diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index f2a5c6360083..a2dbac4aca6b 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -111,4 +111,19 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) arch_mcs_spin_unlock_contended(&next->locked); } +/* + * Cancellable version of the MCS lock above. + * + * Intended for adaptive spinning of sleeping locks: + * mutex_lock()/rwsem_down_{read,write}() etc. 
+ */ + +struct optimistic_spin_queue { + struct optimistic_spin_queue *next, *prev; + int locked; /* 1 if lock acquired */ +}; + +extern bool osq_lock(struct optimistic_spin_queue **lock); +extern void osq_unlock(struct optimistic_spin_queue **lock); + #endif /* __LINUX_MCS_SPINLOCK_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dc3d6f2bbe2a..2670b84067d6 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -53,7 +53,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER - lock->mcs_lock = NULL; + lock->osq = NULL; #endif debug_mutex_init(lock, name, key); @@ -403,7 +403,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (!mutex_can_spin_on_owner(lock)) goto slowpath; - mcs_spin_lock(&lock->mcs_lock, &node); + if (!osq_lock(&lock->osq)) + goto slowpath; + for (;;) { struct task_struct *owner; @@ -442,7 +444,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } mutex_set_owner(lock); - mcs_spin_unlock(&lock->mcs_lock, &node); + osq_unlock(&lock->osq); preempt_enable(); return 0; } @@ -464,7 +466,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } - mcs_spin_unlock(&lock->mcs_lock, &node); + osq_unlock(&lock->osq); slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.3-14-g43fede From 34c6bc2c919a55e5ad4e698510a2f35ee13ab900 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 3 Feb 2014 16:21:09 +0100 Subject: locking/mutexes: Add extra reschedule point Add in an extra reschedule in an attempt to avoid getting rescheduled the moment we've acquired the lock. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-zah5eyn9gu7qlgwh9r6n2anc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2670b84067d6..02c61a9c8906 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -468,6 +468,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } osq_unlock(&lock->osq); slowpath: + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock the mutex. This avoids getting + * scheduled out right after we obtained the mutex. + */ + if (need_resched()) + schedule_preempt_disabled(); #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.3-14-g43fede From db0fbadcbd0c288525ea9f76488b324642a78c7f Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 10 Mar 2014 21:42:11 +0100 Subject: ftrace: Fix compilation warning about control_ops_free With CONFIG_DYNAMIC_FTRACE=n, I see a warning: kernel/trace/ftrace.c:240:13: warning: 'control_ops_free' defined but not used static void control_ops_free(struct ftrace_ops *ops) ^ Move that function around to an already existing #ifdef CONFIG_DYNAMIC_FTRACE block as the function is used solely from the dynamic function tracing functions.
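The general pattern for this kind of warning fix: a helper referenced only from code compiled under an #ifdef should be defined under the same #ifdef, so no unused-function warning fires in the other configuration. A sketch mirroring the move in the diff below:

        #ifdef CONFIG_DYNAMIC_FTRACE
        /* All callers live under CONFIG_DYNAMIC_FTRACE, so define it here. */
        static void control_ops_free(struct ftrace_ops *ops)
        {
                free_percpu(ops->disabled);
        }
        /* ... dynamic function tracing code that calls it ... */
        #endif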
Link: http://lkml.kernel.org/r/1394484131-5107-1-git-send-email-jslaby@suse.cz Signed-off-by: Jiri Slaby Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0e48ff4cefa5..b4531b228180 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -237,11 +237,6 @@ static int control_ops_alloc(struct ftrace_ops *ops) return 0; } -static void control_ops_free(struct ftrace_ops *ops) -{ - free_percpu(ops->disabled); -} - static void update_global_ops(void) { ftrace_func_t func = ftrace_global_list_func; @@ -2100,6 +2095,11 @@ static ftrace_func_t saved_ftrace_func; static int ftrace_start_up; static int global_start_up; +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void ftrace_startup_enable(int command) { if (saved_ftrace_func != ftrace_trace_function) { -- cgit v1.3-14-g43fede From 4d4348202b34c130a899b597fec14bbb5d83108d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 11 Mar 2014 11:23:41 +0100 Subject: PM / Hibernate: Spelling s/anonymouns/anonymous/ Spelling fix. Signed-off-by: Geert Uytterhoeven Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d9f61a145802..149e745eaa52 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1268,7 +1268,7 @@ static void free_unnecessary_pages(void) * [number of saveable pages] - [number of pages that can be freed in theory] * * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages, * minus mapped file pages. */ static unsigned long minimum_image_size(unsigned long saveable) -- cgit v1.3-14-g43fede From af02b5fdb1fb3c5d5f8d71f7f84e4fb243e1ae31 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 11 Mar 2014 12:08:11 +0100 Subject: PM: Add missing "freeze" state Fix descriptions of /sys/power/state in the documentation and in a code comment. Signed-off-by: Geert Uytterhoeven Reviewed-by: Srivatsa S. Bhat Acked-by: Pavel Machek [rjw: Changelog] Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-power | 5 +++-- kernel/power/main.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 205a73878441..64c9276e9421 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -12,8 +12,9 @@ Contact: Rafael J. Wysocki Description: The /sys/power/state file controls the system power state. Reading from this file returns what states are supported, - which is hard-coded to 'standby' (Power-On Suspend), 'mem' - (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). + which is hard-coded to 'freeze' (Low-Power Idle), 'standby' + (Power-On Suspend), 'mem' (Suspend-to-RAM), and 'disk' + (Suspend-to-Disk). Writing to this file one of these strings causes the system to transition into that state. 
Please see the file diff --git a/kernel/power/main.c b/kernel/power/main.c index 1d1bf630e6e9..6271bc4073ef 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -282,8 +282,8 @@ struct kobject *power_kobj; * state - control system power state. * * show() returns what states are supported, which is hard-coded to - * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and - * 'disk' (Suspend-to-Disk). + * 'freeze' (Low-Power Idle), 'standby' (Power-On Suspend), + * 'mem' (Suspend-to-RAM), and 'disk' (Suspend-to-Disk). * * store() accepts one of those strings, translates it into the * proper enumerated value, and initiates a suspend transition. -- cgit v1.3-14-g43fede From 4c11628a16506a8a8e030515f601771df07bba97 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 11 Mar 2014 21:32:28 -0400 Subject: tracepoints: API doc update to data argument Describe the @data argument (probe private data). Link: http://lkml.kernel.org/r/1394587948-27878-1-git-send-email-mathieu.desnoyers@efficios.com Fixes: 38516ab59fbc "tracing: Let tracepoints have data passed to tracepoint callbacks" CC: Ingo Molnar CC: Frederic Weisbecker CC: Andrew Morton Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0058f33d05c1..a4f629da3011 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -376,6 +376,7 @@ tracepoint_add_probe(const char *name, void *probe, void *data) * tracepoint_probe_register - Connect a probe to a tracepoint * @name: tracepoint name * @probe: probe handler + * @data: probe private data * * Returns 0 if ok, error value on error. * The probe address must at least be aligned on the architecture pointer size. 
@@ -424,6 +425,7 @@ tracepoint_remove_probe(const char *name, void *probe, void *data) * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @name: tracepoint name * @probe: probe function pointer + * @data: probe private data * * We do not need to call a synchronize_sched to make sure the probes have * finished running before doing a module unload, because the module unload @@ -464,6 +466,7 @@ static void tracepoint_add_old_probes(void *old) * tracepoint_probe_register_noupdate - register a probe but not connect * @name: tracepoint name * @probe: probe handler + * @data: probe private data * * caller must call tracepoint_probe_update_all() */ @@ -488,6 +491,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect * @name: tracepoint name * @probe: probe function pointer + * @data: probe private data * * caller must call tracepoint_probe_update_all() */ -- cgit v1.3-14-g43fede From 3bbc8db341773eb6aa5576eaabca4e95170fbe34 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 10 Mar 2014 21:04:58 -0400 Subject: tracepoints: API doc update to tracepoint_probe_register() return value Describe the return values of tracepoint_probe_register(), including -ENODEV added by commit: Author: Steven Rostedt tracing: Warn if a tracepoint is not set via debugfs Link: http://lkml.kernel.org/r/1394499898-1537-2-git-send-email-mathieu.desnoyers@efficios.com CC: Ingo Molnar CC: Frederic Weisbecker CC: Andrew Morton Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index a4f629da3011..e2a58a22b0f4 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -378,7 +378,17 @@ tracepoint_add_probe(const char *name, void *probe, void *data) * @probe: probe handler * @data: probe private data * - * Returns 0 if ok, error value on error. + * Returns: + * - 0 if the probe was successfully registered, and tracepoint + * callsites are currently loaded for that probe, + * - -ENODEV if the probe was successfully registered, but no tracepoint + * callsite is currently loaded for that probe, + * - other negative error value on error. + * + * When tracepoint_probe_register() returns either 0 or -ENODEV, + * parameters @name, @probe, and @data may be used by the tracepoint + * infrastructure until the probe is unregistered. + * * The probe address must at least be aligned on the architecture pointer size. 
*/ int tracepoint_probe_register(const char *name, void *probe, void *data) -- cgit v1.3-14-g43fede From d88471cb8b17a72b1edf5ab62e1704d78373c066 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 9 Jan 2013 18:09:20 -0500 Subject: ftrace: Constify ftrace_text_reserved Link: http://lkml.kernel.org/r/1357772960-4436-5-git-send-email-sasha.levin@oracle.com Signed-off-by: Sasha Levin Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 ++-- kernel/trace/ftrace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 20da0561b868..9212b017bc72 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -299,7 +299,7 @@ extern void unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); extern void unregister_ftrace_function_probe_all(char *glob); -extern int ftrace_text_reserved(void *start, void *end); +extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); @@ -552,7 +552,7 @@ static inline __init int unregister_ftrace_command(char *cmd_name) { return -EINVAL; } -static inline int ftrace_text_reserved(void *start, void *end) +static inline int ftrace_text_reserved(const void *start, const void *end) { return 0; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4531b228180..1fd4b9479210 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1555,7 +1555,7 @@ unsigned long ftrace_location(unsigned long ip) * the function tracer. It checks the ftrace internal tables to * determine if the address belongs or not. */ -int ftrace_text_reserved(void *start, void *end) +int ftrace_text_reserved(const void *start, const void *end) { unsigned long ret; -- cgit v1.3-14-g43fede From 383afd0971538b3d77532a56404b24cfe967b5dd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Mar 2014 19:24:20 -0400 Subject: sched: Fix broken setscheduler() I decided to run my tests on linux-next, and my wakeup_rt tracer was broken. After running a bisect, I found that the problem commit was: linux-next commit c365c292d059 "sched: Consider pi boosting in setscheduler()" And the reason the wakeup_rt tracer test was failing was because it had no RT task to trace. I first noticed this when running with sched_switch event and saw that my RT task still had normal SCHED_OTHER priority. Looking at the problem commit, I found: - p->normal_prio = normal_prio(p); - p->prio = rt_mutex_getprio(p); With no + p->normal_prio = normal_prio(p); + p->prio = rt_mutex_getprio(p); Reading what the commit is supposed to do, I realized that the p->prio can't be set if the task is boosted with a higher prio, but the p->normal_prio still needs to be set regardless, otherwise, when the task is deboosted, it won't get the new priority. The p->prio has to be set before "check_class_changed()" is called, otherwise the class won't be changed. Also added a fix to newprio to include a check for deadline policy, which was missing. This change was suggested by Juri Lelli.
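In outline, the corrected newprio computation now distinguishes deadline policy (a sketch of the same logic the diff below adds inline):

        /* Sketch: effective priority for the requested policy. */
        int newprio;

        if (dl_policy(attr->sched_policy))
                newprio = MAX_DL_PRIO - 1;
        else
                newprio = MAX_RT_PRIO - 1 - attr->sched_priority;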
Signed-off-by: Steven Rostedt Cc: Sebastian Andrzej Siewior Cc: Juri Lelli Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20140306120438.638bfe94@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9e126a21c5c7..ae365aaa8181 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3195,6 +3195,7 @@ static void __setscheduler_params(struct task_struct *p, * getparam()/getattr() don't report silly values for !rt tasks. */ p->rt_priority = attr->sched_priority; + p->normal_prio = normal_prio(p); set_load_weight(p); } @@ -3204,6 +3205,12 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, { __setscheduler_params(p, attr); + /* + * If we get here, there was no pi waiters boosting the + * task. It is safe to use the normal prio. + */ + p->prio = normal_prio(p); + if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; else if (rt_prio(p->prio)) @@ -3262,7 +3269,8 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user) { - int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; + int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : + MAX_RT_PRIO - 1 - attr->sched_priority; int retval, oldprio, oldpolicy = -1, on_rq, running; int policy = attr->sched_policy; unsigned long flags; -- cgit v1.3-14-g43fede From a2cd42601b474b957e1a5fe3692bcf7f9363bd51 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 11 Mar 2014 17:26:06 +0100 Subject: sched: Remove double calculation in fix_small_imbalance() The tmp value has already been calculated in: scaled_busy_load_per_task = (busiest->load_per_task * SCHED_POWER_SCALE) / busiest->group_power; Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394555166-22894-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f1eedae1e83e..b301918ed510 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6061,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - busiest->group_power; - if (busiest->avg_load > tmp) { + if (busiest->avg_load > scaled_busy_load_per_task) { pwr_move += busiest->group_power * min(busiest->load_per_task, - busiest->avg_load - tmp); + busiest->avg_load - scaled_busy_load_per_task); } /* Amount of load we'd add */ -- cgit v1.3-14-g43fede From 6037dd1a49f95092824fa8ba75c717ff7805e317 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Wed, 12 Mar 2014 14:51:51 +0800 Subject: sched: Clean up the task_hot() function task_hot() doesn't need the 'sched_domain' parameter, so remove it.
Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1394607111-1904-1-git-send-email-alex.shi@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b301918ed510..7e9bd0b1fa9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5037,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env) * Is this task likely cache-hot: */ static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +task_hot(struct task_struct *p, u64 now) { s64 delta; @@ -5198,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) task is cache cold, or * 3) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); + tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq)); if (!tsk_cache_hot) tsk_cache_hot = migrate_degrades_locality(p, env); -- cgit v1.3-14-g43fede From 6f008e72cd111a119b5d8de8c5438d892aae99eb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Mar 2014 13:24:42 +0100 Subject: locking/mutex: Fix debug checks OK, so commit: 1d8fe7dc8078 ("locking/mutexes: Unlock the mutex without the wait_lock") generates this boot warning when CONFIG_DEBUG_MUTEXES=y: WARNING: CPU: 0 PID: 139 at /usr/src/linux-2.6/kernel/locking/mutex-debug.c:82 debug_mutex_unlock+0x155/0x180() DEBUG_LOCKS_WARN_ON(lock->owner != current) And that makes sense, because as soon as we release the lock a new owner can come in... One would think that !__mutex_slowpath_needs_to_unlock() implementations suffer the same, but for DEBUG we fall back to mutex-null.h which has an unconditional 1 for that. The mutex debug code requires the mutex to be unlocked after doing the debug checks, otherwise it can find inconsistent state. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: jason.low2@hp.com Link: http://lkml.kernel.org/r/20140312122442.GB27965@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.c | 6 ++++++ kernel/locking/mutex.c | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index faf6f5b53e77..e1191c996c59 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock) DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); + + /* + * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug + * mutexes so that we can do it here after we've verified state. + */ + atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 02c61a9c8906..14fe72cc8ce7 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -34,6 +34,13 @@ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" # include +/* + * Must be 0 for the debug case so we do not do the unlock outside of the + * wait_lock region. debug_mutex_unlock() will do the actual unlock in this + * case. 
+ */ +# undef __mutex_slowpath_needs_to_unlock +# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" # include -- cgit v1.3-14-g43fede From c1bacbae8192dd2a9ebadd22d793b68054f6c6e5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 8 Mar 2014 08:59:58 +0100 Subject: genirq: Provide irq_request/release_resources chip callbacks For certain irq types, e.g. gpios, it's necessary to request resources before starting up the irq. This might fail so we cannot use the irq_startup() callback because we might call the irq_set_type() callback before that which does not make sense when the resource is not available. Calling irq_startup() before irq_set_type() can lead to spurious interrupts which is not desired either. Signed-off-by: Thomas Gleixner Cc: Jean-Jacques Hiblot Cc: Grant Likely Cc: linux-arm-kernel@lists.infradead.org Reviewed-by: Linus Walleij Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1403080857160.18573@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 6 ++++++ kernel/irq/manage.c | 28 +++++++++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 7dc10036eff5..e675971bdc3f 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -303,6 +303,10 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_pm_shutdown: function called from core code on shutdown once per chip * @irq_calc_mask: Optional function to set irq_data.mask for special cases * @irq_print_chip: optional to print special chip info in show_interrupts + * @irq_request_resources: optional to request resources before calling + * any other callback related to this irq + * @irq_release_resources: optional to release resources acquired with + * irq_request_resources * @flags: chip specific flags */ struct irq_chip { @@ -336,6 +340,8 @@ struct irq_chip { void (*irq_calc_mask)(struct irq_data *data); void (*irq_print_chip)(struct irq_data *data, struct seq_file *p); + int (*irq_request_resources)(struct irq_data *data); + void (*irq_release_resources)(struct irq_data *data); unsigned long flags; }; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d3bf660cb57f..5d35cbe22896 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -896,6 +896,23 @@ static void irq_setup_forced_threading(struct irqaction *new) } } +static int irq_request_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + return c->irq_request_resources ? c->irq_request_resources(d) : 0; +} + +static void irq_release_resources(struct irq_desc *desc) +{ + struct irq_data *d = &desc->irq_data; + struct irq_chip *c = d->chip; + + if (c->irq_release_resources) + c->irq_release_resources(d); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. 
@@ -1091,6 +1108,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { + ret = irq_request_resources(desc); + if (ret) { + pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n", + new->name, irq, desc->irq_data.chip->name); + goto out_mask; + } + init_waitqueue_head(&desc->wait_for_threads); /* Setup the type (level, edge polarity) if configured: */ @@ -1261,8 +1285,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) *action_ptr = action->next; /* If this was the last handler, shut down the IRQ line: */ - if (!desc->action) + if (!desc->action) { irq_shutdown(desc); + irq_release_resources(desc); + } #ifdef CONFIG_SMP /* make sure affinity_hint is cleaned up */ -- cgit v1.3-14-g43fede From 27bba4d6bb3779a6678b31f9c9b9c1553c63fa95 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 3 Feb 2014 11:13:13 +1030 Subject: module: use pr_cont When dumping loaded modules, we print them one by one in separate printks. Let's use pr_cont as they are continuation prints. Signed-off-by: Jiri Slaby Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d24fcf29cb64..efa1e6031950 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3809,12 +3809,12 @@ void print_modules(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - printk(" %s%s", mod->name, module_flags(mod, buf)); + pr_cont(" %s%s", mod->name, module_flags(mod, buf)); } preempt_enable(); if (last_unloaded_module[0]) - printk(" [last unloaded: %s]", last_unloaded_module); - printk("\n"); + pr_cont(" [last unloaded: %s]", last_unloaded_module); + pr_cont("\n"); } #ifdef CONFIG_MODVERSIONS -- cgit v1.3-14-g43fede From 66cc69e34e86a231fbe68d8918c6119e3b7549a3 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 13 Mar 2014 12:11:30 +1030 Subject: Fix: module signature vs tracepoints: add new TAINT_UNSIGNED_MODULE Users have reported being unable to trace non-signed modules loaded within a kernel supporting module signature. This is caused by tracepoint.c:tracepoint_module_coming() refusing to take into account tracepoints sitting within force-loaded modules (TAINT_FORCED_MODULE). The reason for this check, in the first place, is that a force-loaded module may have a struct module incompatible with the layout expected by the kernel, and can thus cause a kernel crash upon forced load of that module on a kernel with CONFIG_TRACEPOINTS=y. Tracepoints, however, specifically accept TAINT_OOT_MODULE and TAINT_CRAP, since those modules do not lead to the "very likely system crash" issue cited above for force-loaded modules. With kernels having CONFIG_MODULE_SIG=y (signed modules), a non-signed module is tainted re-using the TAINT_FORCED_MODULE taint flag. Unfortunately, this means that Tracepoints treat that module as a force-loaded module, and thus silently refuse to consider any tracepoint within this module. Since an unsigned module does not fit within the "very likely system crash" category of tainting, add a new TAINT_UNSIGNED_MODULE taint flag to specifically address this taint behavior, and accept those modules within Tracepoints. We use the letter 'X' as a taint flag character for a module being loaded that doesn't know how to sign its name (proposed by Steven Rostedt). 
Also add the missing 'O' entry to trace event show_module_flags() list for the sake of completeness. Signed-off-by: Mathieu Desnoyers Acked-by: Steven Rostedt NAKed-by: Ingo Molnar CC: Thomas Gleixner CC: David Howells CC: Greg Kroah-Hartman Signed-off-by: Rusty Russell --- Documentation/ABI/testing/sysfs-module | 1 + Documentation/module-signing.txt | 3 ++- Documentation/oops-tracing.txt | 3 +++ Documentation/sysctl/kernel.txt | 2 ++ include/linux/kernel.h | 1 + include/trace/events/module.h | 4 +++- kernel/module.c | 4 +++- kernel/panic.c | 2 ++ kernel/tracepoint.c | 5 +++-- 9 files changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-module b/Documentation/ABI/testing/sysfs-module index 47064c2b1f79..b9a29cdbaccb 100644 --- a/Documentation/ABI/testing/sysfs-module +++ b/Documentation/ABI/testing/sysfs-module @@ -49,3 +49,4 @@ Description: Module taint flags: O - out-of-tree module F - force-loaded module C - staging driver module + X - unsigned module diff --git a/Documentation/module-signing.txt b/Documentation/module-signing.txt index 2b40e04d3c49..b6af42e4d790 100644 --- a/Documentation/module-signing.txt +++ b/Documentation/module-signing.txt @@ -53,7 +53,8 @@ This has a number of options available: If this is off (ie. "permissive"), then modules for which the key is not available and modules that are unsigned are permitted, but the kernel will - be marked as being tainted. + be marked as being tainted, and the concerned modules will be marked as + tainted, shown with the character 'X'. If this is on (ie. "restrictive"), only modules that have a valid signature that can be verified by a public key in the kernel's possession diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index 13032c0140d4..879abe289523 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -265,6 +265,9 @@ characters, each representing a particular tainted value. 13: 'O' if an externally-built ("out-of-tree") module has been loaded. + 14: 'X' if an unsigned module has been loaded in a kernel supporting + module signature. + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. Tainting is permanent: even if an offending module is diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index e55124e7c40c..8ebe1c047004 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -792,6 +792,8 @@ can be ORed together: 1024 - A module from drivers/staging was loaded. 2048 - The system is working around a severe firmware bug. 4096 - An out-of-tree module has been loaded. +8192 - An unsigned module has been loaded in a kernel supporting module + signature. 
============================================================== diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 196d1ea86df0..471090093c67 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -469,6 +469,7 @@ extern enum system_states { #define TAINT_CRAP 10 #define TAINT_FIRMWARE_WORKAROUND 11 #define TAINT_OOT_MODULE 12 +#define TAINT_UNSIGNED_MODULE 13 extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] diff --git a/include/trace/events/module.h b/include/trace/events/module.h index 161932737416..11fd51b413de 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -22,8 +22,10 @@ struct module; #define show_module_flags(flags) __print_flags(flags, "", \ { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \ + { (1UL << TAINT_OOT_MODULE), "O" }, \ { (1UL << TAINT_FORCED_MODULE), "F" }, \ - { (1UL << TAINT_CRAP), "C" }) + { (1UL << TAINT_CRAP), "C" }, \ + { (1UL << TAINT_UNSIGNED_MODULE), "X" }) TRACE_EVENT(module_load, diff --git a/kernel/module.c b/kernel/module.c index efa1e6031950..c1acb0c5b637 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1013,6 +1013,8 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'F'; if (mod->taints & (1 << TAINT_CRAP)) buf[l++] = 'C'; + if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) + buf[l++] = 'X'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't @@ -3214,7 +3216,7 @@ static int load_module(struct load_info *info, const char __user *uargs, pr_notice_once("%s: module verification failed: signature " "and/or required key missing - tainting " "kernel\n", mod->name); - add_taint_module(mod, TAINT_FORCED_MODULE, LOCKDEP_STILL_OK); + add_taint_module(mod, TAINT_UNSIGNED_MODULE, LOCKDEP_STILL_OK); } #endif diff --git a/kernel/panic.c b/kernel/panic.c index 6d6300375090..0e25fe10871e 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -210,6 +210,7 @@ static const struct tnt tnts[] = { { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, + { TAINT_UNSIGNED_MODULE, 'X', ' ' }, }; /** @@ -228,6 +229,7 @@ static const struct tnt tnts[] = { * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. + * 'X' - Unsigned module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 031cc5655a51..3cdbed1fbdc7 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -633,7 +633,8 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES bool trace_module_has_bad_taint(struct module *mod) { - return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP) | + (1 << TAINT_UNSIGNED_MODULE)); } static int tracepoint_module_coming(struct module *mod) @@ -644,7 +645,7 @@ static int tracepoint_module_coming(struct module *mod) /* * We skip modules that taint the kernel, especially those with different * module headers (for forced load), to make sure we don't cause a crash. - * Staging and out-of-tree GPL modules are fine. + * Staging, out-of-tree, and unsigned GPL modules are fine. 
*/ if (trace_module_has_bad_taint(mod)) return 0; -- cgit v1.3-14-g43fede From dee08a72deefac251267ed2717717596aa8b6818 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 Mar 2014 17:02:22 +0100 Subject: cputime: Fix jiffies based cputime assumption on steal accounting The steal guest time accounting code assumes that cputime_t is based on jiffies. So when CONFIG_NO_HZ_FULL=y, which implies that cputime_t is based on nsecs, steal_account_process_tick() passes the delta in jiffies to account_steal_time() which then accounts it as if it's a value in nsecs. As a result, accounting 1 second of steal time (with HZ=100 that would be 100 jiffies) is spuriously accounted as 100 nsecs. As such /proc/stat may report 0 values of steal time even when two guests have run concurrently for a few seconds on the same host and same CPU. In order to fix this, let's convert the nsecs based steal delta to cputime instead of jiffies by using the right conversion API. Given that the steal time is stored in cputime_t and this type can have a smaller granularity than nsecs, we only account the rounded converted value and leave the remaining nsecs for the next deltas. Reported-by: Huiqingding Reported-by: Marcelo Tosatti Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Peter Zijlstra Cc: Thomas Gleixner Acked-by: Rik van Riel Signed-off-by: Frederic Weisbecker --- kernel/sched/cputime.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 99947919e30b..c91b09770ebd 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -258,16 +258,22 @@ static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT if (static_key_false(&paravirt_steal_enabled)) { - u64 steal, st = 0; + u64 steal; + cputime_t steal_ct; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - st = steal_ticks(steal); - this_rq()->prev_steal_time += st * TICK_NSEC; + /* + * cputime_t may be less precise than nsecs (eg: if it's + * based on jiffies). Lets cast the result to cputime + * granularity and account the rest on the next rounds. + */ + steal_ct = nsecs_to_cputime(steal); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); - account_steal_time(st); - return st; + account_steal_time(steal_ct); + return steal_ct; } #endif return false; -- cgit v1.3-14-g43fede From 300a9d887ea221f344962506f724e02101bacc08 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 5 Mar 2014 17:05:57 +0100 Subject: sched: Remove needless round trip nsecs <-> tick conversion of steal time When update_rq_clock_task() accounts the pending steal time for a task, it converts the steal delta from nsecs to ticks, then from ticks back to nsecs. There is no apparent good reason for doing that though because both the task clock and the prev steal delta are u64 and store values in nsecs. So let's remove the needless conversion.
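A worked example of what the round trip did (illustrative numbers, HZ=100 so TICK_NSEC = 10000000):

        /*
         * A steal delta of 15000000 ns used to be truncated to tick
         * granularity and re-expanded:
         *
         *   st    = steal_ticks(15000000)  = 1
         *   steal = st * TICK_NSEC         = 10000000 ns
         *
         * deferring the 5000000 ns remainder to the next update, even
         * though both the task clock and prev_steal_time_rq are plain
         * u64 nsec values. The new code accounts the raw nsec delta
         * directly.
         */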
Cc: Ingo Molnar Cc: Marcelo Tosatti Cc: Peter Zijlstra Cc: Thomas Gleixner Acked-by: Rik van Riel Signed-off-by: Frederic Weisbecker --- kernel/sched/core.c | 6 ------ kernel/sched/sched.h | 10 ---------- 2 files changed, 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..b14a188af898 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -823,19 +823,13 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((&paravirt_steal_rq_enabled))) { - u64 st; - steal = paravirt_steal_clock(cpu_of(rq)); steal -= rq->prev_steal_time_rq; if (unlikely(steal > delta)) steal = delta; - st = steal_ticks(steal); - steal = st * TICK_NSEC; - rq->prev_steal_time_rq += steal; - delta -= steal; } #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2119fd20f8b..5ec991010122 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1214,16 +1214,6 @@ extern void update_idle_cpu_load(struct rq *this_rq); extern void init_task_runnable_average(struct task_struct *p); -#ifdef CONFIG_PARAVIRT -static inline u64 steal_ticks(u64 steal) -{ - if (unlikely(steal > NSEC_PER_SEC)) - return div_u64(steal, TICK_NSEC); - - return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -} -#endif - static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; -- cgit v1.3-14-g43fede From 89f8b33ca1ea881d1d84542282cb85d07d02e78d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 13 Mar 2014 09:38:42 -0600 Subject: block: remove old blk_iopoll_enabled variable This was a debugging measure to toggle enabled/disabled when testing. But for real production setups, it's not safe to toggle this setting without either reloading drivers or quiescing IO first. Neither of which the toggle enforces. Additionally, it makes drivers deal with the conditional state. Remove it completely. It's up to the driver whether iopoll is enabled or not.
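The resulting driver-side pattern is unconditional setup, with no global toggle to consult (sketch distilled from the shape of the be2iscsi hunks below):

        /* The driver decides: init and enable iopoll for all its queues. */
        for (i = 0; i < phba->num_cpus; i++) {
                pbe_eq = &phwi_context->be_eq[i];
                blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, be_iopoll);
                blk_iopoll_enable(&pbe_eq->iopoll);
        }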
Signed-off-by: Jens Axboe --- block/blk-iopoll.c | 3 - drivers/scsi/be2iscsi/be_main.c | 206 ++++++++++++---------------------------- drivers/scsi/ipr.c | 15 +-- include/linux/blk-iopoll.h | 2 - kernel/sysctl.c | 12 --- 5 files changed, 68 insertions(+), 170 deletions(-) (limited to 'kernel') diff --git a/block/blk-iopoll.c b/block/blk-iopoll.c index 1855bf51edb0..c11d24e379e2 100644 --- a/block/blk-iopoll.c +++ b/block/blk-iopoll.c @@ -14,9 +14,6 @@ #include "blk.h" -int blk_iopoll_enabled = 1; -EXPORT_SYMBOL(blk_iopoll_enabled); - static unsigned int blk_iopoll_budget __read_mostly = 256; static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index 1f375051483a..a929c3c9aedc 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -873,7 +873,6 @@ static irqreturn_t be_isr_msix(int irq, void *dev_id) struct be_queue_info *cq; unsigned int num_eq_processed; struct be_eq_obj *pbe_eq; - unsigned long flags; pbe_eq = dev_id; eq = &pbe_eq->q; @@ -882,31 +881,15 @@ static irqreturn_t be_isr_msix(int irq, void *dev_id) phba = pbe_eq->phba; num_eq_processed = 0; - if (blk_iopoll_enabled) { - while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32] - & EQE_VALID_MASK) { - if (!blk_iopoll_sched_prep(&pbe_eq->iopoll)) - blk_iopoll_sched(&pbe_eq->iopoll); - - AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); - queue_tail_inc(eq); - eqe = queue_tail_node(eq); - num_eq_processed++; - } - } else { - while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32] - & EQE_VALID_MASK) { - spin_lock_irqsave(&phba->isr_lock, flags); - pbe_eq->todo_cq = true; - spin_unlock_irqrestore(&phba->isr_lock, flags); - AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); - queue_tail_inc(eq); - eqe = queue_tail_node(eq); - num_eq_processed++; - } + while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32] + & EQE_VALID_MASK) { + if (!blk_iopoll_sched_prep(&pbe_eq->iopoll)) + blk_iopoll_sched(&pbe_eq->iopoll); - if (pbe_eq->todo_cq) - queue_work(phba->wq, &pbe_eq->work_cqs); + AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); + queue_tail_inc(eq); + eqe = queue_tail_node(eq); + num_eq_processed++; } if (num_eq_processed) @@ -927,7 +910,6 @@ static irqreturn_t be_isr(int irq, void *dev_id) struct hwi_context_memory *phwi_context; struct be_eq_entry *eqe = NULL; struct be_queue_info *eq; - struct be_queue_info *cq; struct be_queue_info *mcc; unsigned long flags, index; unsigned int num_mcceq_processed, num_ioeq_processed; @@ -953,72 +935,40 @@ static irqreturn_t be_isr(int irq, void *dev_id) num_ioeq_processed = 0; num_mcceq_processed = 0; - if (blk_iopoll_enabled) { - while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32] - & EQE_VALID_MASK) { - if (((eqe->dw[offsetof(struct amap_eq_entry, - resource_id) / 32] & - EQE_RESID_MASK) >> 16) == mcc->id) { - spin_lock_irqsave(&phba->isr_lock, flags); - pbe_eq->todo_mcc_cq = true; - spin_unlock_irqrestore(&phba->isr_lock, flags); - num_mcceq_processed++; - } else { - if (!blk_iopoll_sched_prep(&pbe_eq->iopoll)) - blk_iopoll_sched(&pbe_eq->iopoll); - num_ioeq_processed++; - } - AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); - queue_tail_inc(eq); - eqe = queue_tail_node(eq); - } - if (num_ioeq_processed || num_mcceq_processed) { - if (pbe_eq->todo_mcc_cq) - queue_work(phba->wq, &pbe_eq->work_cqs); - - if ((num_mcceq_processed) && (!num_ioeq_processed)) - hwi_ring_eq_db(phba, eq->id, 0, - (num_ioeq_processed + - num_mcceq_processed) , 1, 1); - else - 
hwi_ring_eq_db(phba, eq->id, 0, - (num_ioeq_processed + - num_mcceq_processed), 0, 1); - - return IRQ_HANDLED; - } else - return IRQ_NONE; - } else { - cq = &phwi_context->be_cq[0]; - while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32] - & EQE_VALID_MASK) { - - if (((eqe->dw[offsetof(struct amap_eq_entry, - resource_id) / 32] & - EQE_RESID_MASK) >> 16) != cq->id) { - spin_lock_irqsave(&phba->isr_lock, flags); - pbe_eq->todo_mcc_cq = true; - spin_unlock_irqrestore(&phba->isr_lock, flags); - } else { - spin_lock_irqsave(&phba->isr_lock, flags); - pbe_eq->todo_cq = true; - spin_unlock_irqrestore(&phba->isr_lock, flags); - } - AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); - queue_tail_inc(eq); - eqe = queue_tail_node(eq); + while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32] + & EQE_VALID_MASK) { + if (((eqe->dw[offsetof(struct amap_eq_entry, + resource_id) / 32] & + EQE_RESID_MASK) >> 16) == mcc->id) { + spin_lock_irqsave(&phba->isr_lock, flags); + pbe_eq->todo_mcc_cq = true; + spin_unlock_irqrestore(&phba->isr_lock, flags); + num_mcceq_processed++; + } else { + if (!blk_iopoll_sched_prep(&pbe_eq->iopoll)) + blk_iopoll_sched(&pbe_eq->iopoll); num_ioeq_processed++; } - if (pbe_eq->todo_cq || pbe_eq->todo_mcc_cq) + AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); + queue_tail_inc(eq); + eqe = queue_tail_node(eq); + } + if (num_ioeq_processed || num_mcceq_processed) { + if (pbe_eq->todo_mcc_cq) queue_work(phba->wq, &pbe_eq->work_cqs); - if (num_ioeq_processed) { + if ((num_mcceq_processed) && (!num_ioeq_processed)) hwi_ring_eq_db(phba, eq->id, 0, - num_ioeq_processed, 1, 1); - return IRQ_HANDLED; - } else - return IRQ_NONE; - } + (num_ioeq_processed + + num_mcceq_processed) , 1, 1); + else + hwi_ring_eq_db(phba, eq->id, 0, + (num_ioeq_processed + + num_mcceq_processed), 0, 1); + + return IRQ_HANDLED; + } else + return IRQ_NONE; } static int beiscsi_init_irqs(struct beiscsi_hba *phba) @@ -5216,11 +5166,10 @@ static void beiscsi_quiesce(struct beiscsi_hba *phba, } pci_disable_msix(phba->pcidev); - if (blk_iopoll_enabled) - for (i = 0; i < phba->num_cpus; i++) { - pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_disable(&pbe_eq->iopoll); - } + for (i = 0; i < phba->num_cpus; i++) { + pbe_eq = &phwi_context->be_eq[i]; + blk_iopoll_disable(&pbe_eq->iopoll); + } if (unload_state == BEISCSI_CLEAN_UNLOAD) { destroy_workqueue(phba->wq); @@ -5429,32 +5378,18 @@ static void beiscsi_eeh_resume(struct pci_dev *pdev) phwi_ctrlr = phba->phwi_ctrlr; phwi_context = phwi_ctrlr->phwi_ctxt; - if (blk_iopoll_enabled) { - for (i = 0; i < phba->num_cpus; i++) { - pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, - be_iopoll); - blk_iopoll_enable(&pbe_eq->iopoll); - } - - i = (phba->msix_enabled) ? i : 0; - /* Work item for MCC handling */ + for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - INIT_WORK(&pbe_eq->work_cqs, beiscsi_process_all_cqs); - } else { - if (phba->msix_enabled) { - for (i = 0; i <= phba->num_cpus; i++) { - pbe_eq = &phwi_context->be_eq[i]; - INIT_WORK(&pbe_eq->work_cqs, - beiscsi_process_all_cqs); - } - } else { - pbe_eq = &phwi_context->be_eq[0]; - INIT_WORK(&pbe_eq->work_cqs, - beiscsi_process_all_cqs); - } + blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, + be_iopoll); + blk_iopoll_enable(&pbe_eq->iopoll); } + i = (phba->msix_enabled) ? 
i : 0; + /* Work item for MCC handling */ + pbe_eq = &phwi_context->be_eq[i]; + INIT_WORK(&pbe_eq->work_cqs, beiscsi_process_all_cqs); + ret = beiscsi_init_irqs(phba); if (ret < 0) { beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_INIT, @@ -5614,32 +5549,18 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev, phwi_ctrlr = phba->phwi_ctrlr; phwi_context = phwi_ctrlr->phwi_ctxt; - if (blk_iopoll_enabled) { - for (i = 0; i < phba->num_cpus; i++) { - pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, - be_iopoll); - blk_iopoll_enable(&pbe_eq->iopoll); - } - - i = (phba->msix_enabled) ? i : 0; - /* Work item for MCC handling */ + for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - INIT_WORK(&pbe_eq->work_cqs, beiscsi_process_all_cqs); - } else { - if (phba->msix_enabled) { - for (i = 0; i <= phba->num_cpus; i++) { - pbe_eq = &phwi_context->be_eq[i]; - INIT_WORK(&pbe_eq->work_cqs, - beiscsi_process_all_cqs); - } - } else { - pbe_eq = &phwi_context->be_eq[0]; - INIT_WORK(&pbe_eq->work_cqs, - beiscsi_process_all_cqs); - } + blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, + be_iopoll); + blk_iopoll_enable(&pbe_eq->iopoll); } + i = (phba->msix_enabled) ? i : 0; + /* Work item for MCC handling */ + pbe_eq = &phwi_context->be_eq[i]; + INIT_WORK(&pbe_eq->work_cqs, beiscsi_process_all_cqs); + ret = beiscsi_init_irqs(phba); if (ret < 0) { beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_INIT, @@ -5668,11 +5589,10 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev, free_blkenbld: destroy_workqueue(phba->wq); - if (blk_iopoll_enabled) - for (i = 0; i < phba->num_cpus; i++) { - pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_disable(&pbe_eq->iopoll); - } + for (i = 0; i < phba->num_cpus; i++) { + pbe_eq = &phwi_context->be_eq[i]; + blk_iopoll_disable(&pbe_eq->iopoll); + } free_twq: beiscsi_clean_port(phba); beiscsi_free_mem(phba); diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 3f5b56a99892..69470f5c0ac9 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -3630,16 +3630,14 @@ static ssize_t ipr_store_iopoll_weight(struct device *dev, return strlen(buf); } - if (blk_iopoll_enabled && ioa_cfg->iopoll_weight && - ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { + if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll); } spin_lock_irqsave(shost->host_lock, lock_flags); ioa_cfg->iopoll_weight = user_iopoll_weight; - if (blk_iopoll_enabled && ioa_cfg->iopoll_weight && - ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { + if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) { blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll, ioa_cfg->iopoll_weight, ipr_iopoll); @@ -5484,8 +5482,7 @@ static irqreturn_t ipr_isr_mhrrq(int irq, void *devp) return IRQ_NONE; } - if (blk_iopoll_enabled && ioa_cfg->iopoll_weight && - ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { + if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { if ((be32_to_cpu(*hrrq->hrrq_curr) & IPR_HRRQ_TOGGLE_BIT) == hrrq->toggle_bit) { if (!blk_iopoll_sched_prep(&hrrq->iopoll)) @@ -9859,8 +9856,7 @@ static int ipr_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) ioa_cfg->host->max_channel = IPR_VSET_BUS; ioa_cfg->iopoll_weight = ioa_cfg->chip_cfg->iopoll_weight; - if (blk_iopoll_enabled && ioa_cfg->iopoll_weight && - ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { + if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && 
ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) { blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll, ioa_cfg->iopoll_weight, ipr_iopoll); @@ -9889,8 +9885,7 @@ static void ipr_shutdown(struct pci_dev *pdev) int i; spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); - if (blk_iopoll_enabled && ioa_cfg->iopoll_weight && - ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { + if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { ioa_cfg->iopoll_weight = 0; for (i = 1; i < ioa_cfg->hrrq_num; i++) blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll);
diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h index 308734d3d4a2..77ae77c0b704 100644 --- a/include/linux/blk-iopoll.h +++ b/include/linux/blk-iopoll.h @@ -43,6 +43,4 @@ extern void __blk_iopoll_complete(struct blk_iopoll *); extern void blk_iopoll_enable(struct blk_iopoll *); extern void blk_iopoll_disable(struct blk_iopoll *); -extern int blk_iopoll_enabled; - #endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 49e13e1f8fe6..ef0bf04e8649 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -112,9 +112,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_BLOCK -extern int blk_iopoll_enabled; -#endif /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR @@ -1093,15 +1090,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#endif -#ifdef CONFIG_BLOCK - { - .procname = "blk_iopoll", - .data = &blk_iopoll_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { } }; -- cgit v1.3-14-g43fede
From 328a4978df833249b099c9875738d7b72042ffe1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2014 19:03:51 +0100 Subject: genirq: Add a new IRQCHIP_EOI_THREADED flag The flag is necessary for interrupt chips which require an ACK/EOI after the handler has run. In case of threaded handlers this needs to happen after the threaded handler has completed, before the unmask of the interrupt. The flag is only useful in combination with the handle_fasteoi_irq flow control handler. It can be combined with the flag IRQCHIP_EOI_IF_HANDLED, so the EOI is not issued when the interrupt is disabled or in progress.
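A minimal sketch of how a driver might opt in to the new flag; all foo_* names and register offsets below are invented for illustration and are not part of this patch:

#include <linux/io.h>
#include <linux/irq.h>

#define FOO_IRQ_PEND_REG	0x04	/* hypothetical pending/ack register */
#define FOO_IRQ_ENABLE_REG	0x08	/* hypothetical enable register */

static void __iomem *foo_base;

static void foo_irq_mask(struct irq_data *d)
{
	writel(0, foo_base + FOO_IRQ_ENABLE_REG);
}

static void foo_irq_unmask(struct irq_data *d)
{
	writel(1, foo_base + FOO_IRQ_ENABLE_REG);
}

/*
 * With IRQCHIP_EOI_THREADED set, this callback runs only after the
 * threaded handler has completed, right before the unmask.
 */
static void foo_irq_eoi(struct irq_data *d)
{
	writel(1, foo_base + FOO_IRQ_PEND_REG);
}

static struct irq_chip foo_irq_chip = {
	.name		= "foo-nmi",
	.irq_mask	= foo_irq_mask,
	.irq_unmask	= foo_irq_unmask,
	.irq_eoi	= foo_irq_eoi,
	.flags		= IRQCHIP_EOI_THREADED | IRQCHIP_EOI_IF_HANDLED,
};

static void foo_setup_irq(unsigned int irq)
{
	/* The flag only takes effect with the fasteoi flow handler. */
	irq_set_chip_and_handler(irq, &foo_irq_chip, handle_fasteoi_irq);
}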
Tested-by: Hans de Goede Reviewed-by: Hans de Goede Cc: linux-arm-kernel@lists.infradead.org Cc: linux-sunxi@googlegroups.com Cc: Maxime Ripard Link: http://lkml.kernel.org/r/1394733834-26839-2-git-send-email-hdegoede@redhat.com Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- kernel/irq/internals.h | 1 + kernel/irq/manage.c | 2 +- 4 files changed, 44 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 67ace7aa7947..d278838908cb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -356,6 +356,7 @@ struct irq_chip { * when irq enabled * IRQCHIP_SKIP_SET_WAKE: Skip chip.irq_set_wake(), for this irq chip * IRQCHIP_ONESHOT_SAFE: One shot does not require mask/unmask + * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -364,6 +365,7 @@ enum { IRQCHIP_ONOFFLINE_ENABLED = (1 << 3), IRQCHIP_SKIP_SET_WAKE = (1 << 4), IRQCHIP_ONESHOT_SAFE = (1 << 5), + IRQCHIP_EOI_THREADED = (1 << 6), }; /* This include will go away once we isolated irq_desc usage to core code */ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc04c166c54d..6397df2d6945 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -281,6 +281,19 @@ void unmask_irq(struct irq_desc *desc) } } +void unmask_threaded_irq(struct irq_desc *desc) +{ + struct irq_chip *chip = desc->irq_data.chip; + + if (chip->flags & IRQCHIP_EOI_THREADED) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_unmask) { + chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); + } +} + /* * handle_nested_irq - Handle a nested irq from a irq thread * @irq: the interrupt number @@ -435,6 +448,27 @@ static inline void preflow_handler(struct irq_desc *desc) static inline void preflow_handler(struct irq_desc *desc) { } #endif +static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip) +{ + if (!(desc->istate & IRQS_ONESHOT)) { + chip->irq_eoi(&desc->irq_data); + return; + } + /* + * We need to unmask in the following cases: + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). 
+ */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) { + chip->irq_eoi(&desc->irq_data); + unmask_irq(desc); + } else if (!(chip->flags & IRQCHIP_EOI_THREADED)) { + chip->irq_eoi(&desc->irq_data); + } +} + /** * handle_fasteoi_irq - irq handler for transparent controllers * @irq: the interrupt number @@ -448,6 +482,8 @@ static inline void preflow_handler(struct irq_desc *desc) { } void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { + struct irq_chip *chip = desc->irq_data.chip; + raw_spin_lock(&desc->lock); if (unlikely(irqd_irq_inprogress(&desc->irq_data))) @@ -473,18 +509,14 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); - if (desc->istate & IRQS_ONESHOT) - cond_unmask_irq(desc); + cond_unmask_eoi_irq(desc, chip); -out_eoi: - desc->irq_data.chip->irq_eoi(&desc->irq_data); -out_unlock: raw_spin_unlock(&desc->lock); return; out: - if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) - goto out_eoi; - goto out_unlock; + if (!(chip->flags & IRQCHIP_EOI_IF_HANDLED)) + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); } /** diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 17b671713d5f..ddf1ffeb79f1 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -74,6 +74,7 @@ extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); +extern void unmask_threaded_irq(struct irq_desc *desc); extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index de1a8ed29b40..2486a4c1a710 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -748,7 +748,7 @@ again: if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && irqd_irq_masked(&desc->irq_data)) - unmask_irq(desc); + unmask_threaded_irq(desc); out_unlock: raw_spin_unlock_irq(&desc->lock); -- cgit v1.3-14-g43fede From 09294e31b1779dda22f420c195994a0db54c9a92 Mon Sep 17 00:00:00 2001 From: "David A. Long" Date: Fri, 7 Mar 2014 10:32:22 -0500 Subject: uprobes: Kconfig dependency fix Suggested change from Oleg Nesterov. Fixes incomplete dependencies for uprobes feature. Signed-off-by: David A. Long Acked-by: Oleg Nesterov --- arch/Kconfig | 6 +----- kernel/trace/Kconfig | 1 + 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 80bbb8ccd0d1..97ff872c7acc 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -86,9 +86,7 @@ config KPROBES_ON_FTRACE optimize on top of function tracing. config UPROBES - bool "Transparent user-space probes (EXPERIMENTAL)" - depends on UPROBE_EVENT && PERF_EVENTS - default n + def_bool n select PERCPU_RWSEM help Uprobes is the user-space counterpart to kprobes: they @@ -101,8 +99,6 @@ config UPROBES managed by the kernel and kept transparent to the probed application. ) - If in doubt, say "N". 
- config HAVE_64BIT_ALIGNED_ACCESS def_bool 64BIT && !HAVE_EFFICIENT_UNALIGNED_ACCESS help diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 015f85aaca08..8639819f6cef 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -424,6 +424,7 @@ config UPROBE_EVENT bool "Enable uprobes-based dynamic events" depends on ARCH_SUPPORTS_UPROBES depends on MMU + depends on PERF_EVENTS select UPROBES select PROBE_EVENTS select TRACING -- cgit v1.3-14-g43fede
From 6fe50a28ba6e5fafb4a549dea666dd15297dd8bd Mon Sep 17 00:00:00 2001 From: "David A. Long" Date: Mon, 3 Feb 2014 14:25:49 -0500 Subject: uprobes: allow ignoring of probe hits Allow arches to decide whether to ignore a probe hit. ARM will use this to call handlers only if the conditions to execute a conditionally executed instruction are satisfied. Signed-off-by: David A. Long Acked-by: Oleg Nesterov --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'kernel')
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e32251e00e62..edff2b97b864 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -126,6 +126,7 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); +extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); #else /* !CONFIG_UPROBES */ struct uprobes_state { };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 307d87c0991a..04709b66369d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1804,6 +1804,11 @@ static bool handle_trampoline(struct pt_regs *regs) return true; } +bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) +{ + return false; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1858,7 +1863,11 @@ static void handle_swbp(struct pt_regs *regs) if (!get_utask()) goto out; + if (arch_uprobe_ignore(&uprobe->arch, regs)) + goto out; + handler_chain(uprobe, regs); + if (can_skip_sstep(uprobe, regs)) goto out; -- cgit v1.3-14-g43fede
From 3eb59ec64fc7a3f4576da23f811b39331b830ba2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Mar 2014 17:02:36 +0800 Subject: cgroup: fix a failure path in create_css() If online_css() fails, we should remove cgroup files belonging to css->ss.
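For context, a generic sketch of the layered-goto unwind idiom this fix applies; every foo_* name is invented, and each error label undoes exactly the steps that succeeded, in reverse order:

struct foo;				/* opaque example object */

int foo_init_ref(struct foo *f);	/* assumed helpers, declarations only */
int foo_populate_dir(struct foo *f);
int foo_online(struct foo *f);
void foo_clear_dir(struct foo *f);
void foo_cancel_ref(struct foo *f);

static int foo_create(struct foo *f)
{
	int err;

	err = foo_init_ref(f);
	if (err)
		goto err_out;

	err = foo_populate_dir(f);
	if (err)
		goto err_cancel_ref;

	err = foo_online(f);
	if (err)
		goto err_clear_dir;	/* must also undo foo_populate_dir() */

	return 0;

err_clear_dir:
	foo_clear_dir(f);
err_cancel_ref:
	foo_cancel_ref(f);
err_out:
	return err;
}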
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel')
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 105f273b6f86..0c753ddd223b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4112,17 +4112,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) err = percpu_ref_init(&css->refcnt, css_release); if (err) - goto err_free; + goto err_free_css; init_css(css, ss, cgrp); err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); if (err) - goto err_free; + goto err_free_percpu_ref; err = online_css(css); if (err) - goto err_free; + goto err_clear_dir; dget(cgrp->dentry); css_get(css->parent); @@ -4138,8 +4138,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; -err_free: +err_clear_dir: + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); +err_free_percpu_ref: percpu_ref_cancel_init(&css->refcnt); +err_free_css: ss->css_free(css); return err; } -- cgit v1.3-14-g43fede
From d532676cc7329e1088702ccb0015942cc370b954 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Mar 2014 11:19:52 +0100 Subject: softirq: Add linux/irq.h to make it compile again On Sparc and S390 the removal of irq.h from kernel_stat.h causes: kernel/softirq.c:774:9: error: 'NR_IRQS_LEGACY' undeclared Reported-by: Peter Zijlstra Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel')
diff --git a/kernel/softirq.c b/kernel/softirq.c index 490fcbb1dc5b..b50990a5bea0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -25,6 +25,7 @@ #include <linux/smp.h> #include <linux/smpboot.h> #include <linux/tick.h> +#include <linux/irq.h> #define CREATE_TRACE_POINTS #include <trace/events/irq.h> -- cgit v1.3-14-g43fede
From bab5c790cc64adb1ede54b4077444375108ac8da Mon Sep 17 00:00:00 2001 From: Chema Gonzalez Date: Thu, 13 Mar 2014 19:50:55 -0700 Subject: genirq: procfs: Make smp_affinity values go+r Includes: - /proc/irq/default_smp_affinity - /proc/irq/*/affinity_hint - /proc/irq/*/smp_affinity - /proc/irq/*/smp_affinity_list Users can distill the same information by reading /proc/interrupts.
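A small userspace sketch, not part of the patch, of what the relaxed modes permit: with smp_affinity now 0644 instead of 0600, an unprivileged process can read an IRQ's affinity mask directly.

#include <stdio.h>

int main(void)
{
	char mask[64];
	FILE *f = fopen("/proc/irq/0/smp_affinity", "r");

	if (!f)
		return 1;
	if (fgets(mask, sizeof(mask), f))
		printf("irq0 affinity: %s", mask);
	fclose(f);
	return 0;
}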
Signed-off-by: Chema Gonzalez Cc: Eric Dumazet Link: http://lkml.kernel.org/r/1394765455-1217-1-git-send-email-chema@google.com Signed-off-by: Thomas Gleixner --- kernel/irq/proc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel')
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 36f6ee181b0c..ac1ba2f11032 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -324,15 +324,15 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ - proc_create_data("smp_affinity", 0600, desc->dir, + proc_create_data("smp_affinity", 0644, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); /* create /proc/irq/<irq>/affinity_hint */ - proc_create_data("affinity_hint", 0400, desc->dir, + proc_create_data("affinity_hint", 0444, desc->dir, &irq_affinity_hint_proc_fops, (void *)(long)irq); /* create /proc/irq/<irq>/smp_affinity_list */ - proc_create_data("smp_affinity_list", 0600, desc->dir, + proc_create_data("smp_affinity_list", 0644, desc->dir, &irq_affinity_list_proc_fops, (void *)(long)irq); proc_create_data("node", 0444, desc->dir, @@ -372,7 +372,7 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action) static void register_default_affinity_proc(void) { #ifdef CONFIG_SMP - proc_create("irq/default_smp_affinity", 0600, NULL, + proc_create("irq/default_smp_affinity", 0644, NULL, &default_affinity_proc_fops); #endif } -- cgit v1.3-14-g43fede
From 5d77381fd8aa631a8fda718c395da1319afb5d2d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:53 -0400 Subject: cgroup: relocate setting of CGRP_DEAD In cgroup_destroy_locked(), move setting of CGRP_DEAD above invocations of kill_css(). This doesn't make any visible behavior difference now but will be used to inhibit manipulating controller enable states of a dying cgroup on the unified hierarchy. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel')
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 306ad0ed19ef..b604c7e0cfc6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3867,6 +3867,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!empty) return -EBUSY; + /* + * Mark @cgrp dead. This prevents further task migration and child + * creation by disabling cgroup_lock_live_group(). Note that + * CGRP_DEAD assertion is depended upon by css_next_child() to + * resume iteration after dropping RCU read lock. See + * css_next_child() for details. + */ + set_bit(CGRP_DEAD, &cgrp->flags); + /* * Initiate massacre of all css's. cgroup_destroy_css_killed() * will be invoked to perform the rest of destruction once the @@ -3878,15 +3887,6 @@ kill_css(css); mutex_lock(&cgroup_mutex); - /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by css_next_child() to - * resume iteration after dropping RCU read lock. See - * css_next_child() for details.
- */ - set_bit(CGRP_DEAD, &cgrp->flags); - /* CGRP_DEAD is set, remove from ->release_list for the last time */ raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) -- cgit v1.3-14-g43fede
From 172a2c0685ff3bc0b7a611b308aac0694de34594 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:53 -0400 Subject: cgroup: reorganize cgroup bootstrapping * Fields of init_css_set and css_set_count are now set using initializer instead of programmatically from cgroup_init_early(). * init_cgroup_root() now also takes @opts and performs the optional part of initialization too. The leftover part of cgroup_root_from_opts() is collapsed into its only caller - cgroup_mount(). * Initialization of cgroup_root_count and linking of init_css_set are moved from cgroup_init_early() to cgroup_init(). None of the early_init users depends on init_css_set being linked. * Subsystem initializations are moved after dummy hierarchy init and init_css_set linking. These changes reorganize the bootstrap logic so that the dummy hierarchy can share the usual hierarchy init path and be made more normal. These changes don't cause any noticeable behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 100 +++++++++++++++++++++++++++----------------------- 1 file changed, 49 insertions(+), 51 deletions(-) (limited to 'kernel')
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b604c7e0cfc6..e66b9ee5ecc1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -338,16 +338,24 @@ struct cgrp_cset_link { struct list_head cgrp_link; }; -/* The default css_set - used by init and its children prior to any +/* + * The default css_set - used by init and its children prior to any * hierarchies being mounted. It contains a pointer to the root state * for each subsystem. Also used to anchor the list of css_sets. Not * reference-counted, to improve performance when child cgroups * haven't been created. */ +static struct css_set init_css_set = { + .refcount = ATOMIC_INIT(1), + .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), + .tasks = LIST_HEAD_INIT(init_css_set.tasks), + .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), +}; -static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; -static int css_set_count; +static int css_set_count = 1; /* 1 for init_css_set */ /* * hash table for cgroup groups.
This improves the performance to find @@ -1352,7 +1360,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dummy_css.cgroup = cgrp; } -static void init_cgroup_root(struct cgroupfs_root *root) +static void init_cgroup_root(struct cgroupfs_root *root, + struct cgroup_sb_opts *opts) { struct cgroup *cgrp = &root->top_cgroup; @@ -1361,20 +1370,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) cgrp->root = root; init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); -} - -static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) -{ - struct cgroupfs_root *root; - - if (!opts->subsys_mask && !opts->none) - return ERR_PTR(-EINVAL); - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) - return ERR_PTR(-ENOMEM); - - init_cgroup_root(root); root->flags = opts->flags; if (opts->release_agent) @@ -1383,7 +1378,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->name, opts->name); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); - return root; } static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) @@ -1548,13 +1542,24 @@ retry: goto out_unlock; } - /* no such thing, create a new one */ - root = cgroup_root_from_opts(&opts); - if (IS_ERR(root)) { - ret = PTR_ERR(root); + /* + * No such thing, create a new one. name= matching without subsys + * specification is allowed for already existing hierarchies but we + * can't create new one without subsys specification. + */ + if (!opts.subsys_mask && !opts.none) { + ret = -EINVAL; + goto out_unlock; + } + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) { + ret = -ENOMEM; goto out_unlock; } + init_cgroup_root(root, &opts); + ret = cgroup_setup_root(root, opts.subsys_mask); if (ret) cgroup_free_root(root); @@ -4030,26 +4035,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init cgroup_init_early(void) { + static struct cgroup_sb_opts __initdata opts = { }; struct cgroup_subsys *ss; int i; - atomic_set(&init_css_set.refcount, 1); - INIT_LIST_HEAD(&init_css_set.cgrp_links); - INIT_LIST_HEAD(&init_css_set.tasks); - INIT_LIST_HEAD(&init_css_set.mg_tasks); - INIT_LIST_HEAD(&init_css_set.mg_preload_node); - INIT_LIST_HEAD(&init_css_set.mg_node); - INIT_HLIST_NODE(&init_css_set.hlist); - css_set_count = 1; - init_cgroup_root(&cgroup_dummy_root); - cgroup_root_count = 1; + init_cgroup_root(&cgroup_dummy_root, &opts); RCU_INIT_POINTER(init_task.cgroups, &init_css_set); - init_cgrp_cset_link.cset = &init_css_set; - init_cgrp_cset_link.cgrp = cgroup_dummy_top; - list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); - list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", @@ -4077,22 +4069,10 @@ int __init cgroup_init(void) { struct cgroup_subsys *ss; unsigned long key; - int i, err; + int ssid, err; BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - for_each_subsys(ss, i) { - if (!ss->early_init) - cgroup_init_subsys(ss); - - /* - * cftype registration needs kmalloc and can't be done - * during early_init. Register base cftypes separately. 
- */ - if (ss->base_cftypes) - WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); - } - /* allocate id for the dummy hierarchy */ mutex_lock(&cgroup_mutex); @@ -4106,8 +4086,26 @@ int __init cgroup_init(void) 0, 1, GFP_KERNEL); BUG_ON(err < 0); + cgroup_root_count = 1; + init_cgrp_cset_link.cset = &init_css_set; + init_cgrp_cset_link.cgrp = cgroup_dummy_top; + list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); + list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); + mutex_unlock(&cgroup_mutex); + for_each_subsys(ss, ssid) { + if (!ss->early_init) + cgroup_init_subsys(ss); + + /* + * cftype registration needs kmalloc and can't be done + * during early_init. Register base cftypes separately. + */ + if (ss->base_cftypes) + WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); + } + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); if (!cgroup_kobj) return -ENOMEM; -- cgit v1.3-14-g43fede From 985ed670144c25058f235276f69d687de1b7c7ba Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:53 -0400 Subject: cgroup: use cgroup_setup_root() to initialize cgroup_dummy_root cgroup_dummy_root is used to host controllers which aren't attached to any other hierarchy. The root is minimally set up during kernfs bootstrap and didn't go through full hierarchy initialization. We're planning to use cgroup_dummy_root for the default unified hierarchy and thus want it to be fully functional. Replace the special initialization, which was collected into cgroup_init() by the previous patch, with an invocation of cgroup_setup_root(). This simplifies the init path and makes cgroup_dummy_root a full hierarchy with its own kernfs_root and all. As this puts the dummy hierarchy on the cgroup_roots list, rename for_each_active_root() to for_each_root() and update its users to skip the dummy root for now. This patch doesn't cause any userland visible behavior changes at this point. 
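As a quick illustration of the iteration change described above (a sketch built from this patch's own diff, not a new API): walking the root list now includes the dummy root, which existing users skip explicitly.

static void walk_real_hierarchies(void)
{
	struct cgroupfs_root *root;

	for_each_root(root) {
		/* the dummy root is now on cgroup_roots; skip it for now */
		if (root == &cgroup_dummy_root)
			continue;

		/* operate on a regular hierarchy, as before */
	}
}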
Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e66b9ee5ecc1..78017f52c69b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -289,8 +289,8 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) -/* iterate across the active hierarchies */ -#define for_each_active_root(root) \ +/* iterate across the hierarchies */ +#define for_each_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) /** @@ -354,7 +354,6 @@ static struct css_set init_css_set = { .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), }; -static struct cgrp_cset_link init_cgrp_cset_link; static int css_set_count = 1; /* 1 for init_css_set */ /* @@ -693,14 +692,13 @@ static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) return top_cgrp->root; } -static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) +static int cgroup_init_root_id(struct cgroupfs_root *root) { int id; lockdep_assert_held(&cgroup_mutex); - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, - GFP_KERNEL); + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL); if (id < 0) return id; @@ -1405,8 +1403,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) if (ret) goto out; - /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ - ret = cgroup_init_root_id(root, 2, 0); + ret = cgroup_init_root_id(root); if (ret) goto out; @@ -1486,9 +1483,12 @@ retry: goto out_unlock; /* look for a matching existing root */ - for_each_active_root(root) { + for_each_root(root) { bool name_match = false; + if (root == &cgroup_dummy_root) + continue; + /* * If we asked for a name then it must match. Also, if * name matches but sybsys_mask doesn't, we should fail. 
@@ -2106,9 +2106,12 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); - for_each_active_root(root) { + for_each_root(root) { struct cgroup *from_cgrp; + if (root == &cgroup_dummy_root) + continue; + down_read(&css_set_rwsem); from_cgrp = task_cgroup_from_root(from, root); up_read(&css_set_rwsem); @@ -4073,26 +4076,17 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - /* allocate id for the dummy hierarchy */ + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); - - err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, - 0, 1, GFP_KERNEL); - BUG_ON(err < 0); - - cgroup_root_count = 1; - init_cgrp_cset_link.cset = &init_css_set; - init_cgrp_cset_link.cgrp = cgroup_dummy_top; - list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); - list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); + BUG_ON(cgroup_setup_root(&cgroup_dummy_root, 0)); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); for_each_subsys(ss, ssid) { if (!ss->early_init) cgroup_init_subsys(ss); @@ -4176,11 +4170,14 @@ int proc_cgroup_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); down_read(&css_set_rwsem); - for_each_active_root(root) { + for_each_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; int ssid, count = 0; + if (root == &cgroup_dummy_root) + continue; + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) -- cgit v1.3-14-g43fede
From 5df3603229e520442502ff7fc5715c77bbf61912 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: treat cgroup_dummy_root as an equivalent hierarchy during rebinding Currently, while rebinding, cgroup_dummy_root serves as the anchor point. In addition to the target root, rebind_subsystems() takes @added_mask and @removed_mask. The subsystems specified in the former are expected to be on the dummy root and then moved to the target root. The ones in the latter are moved from non-dummy root to dummy. Now that the dummy root is a fully functional one and we're planning to use it for the default unified hierarchy, this level of distinction between dummy and non-dummy roots is quite awkward. This patch updates rebind_subsystems() to take the target root and one subsystem mask and move the specified subsystems to the target root, which may or may not be the dummy root. IOW, unbinding now becomes moving the subsystems to the dummy root and binding to non-dummy root. This makes the dummy root mostly equivalent to other hierarchies in terms of the mechanism of moving subsystems around; however, we still retain all the semantic restrictions so that this patch doesn't introduce any visible behavior differences. Another noteworthy detail is that rebind_subsystems() guarantees that moving a subsystem to the dummy root never fails so that valid unmounting attempts always succeed. This unifies binding and unbinding of subsystems. The invocation points of ->bind() were inconsistent between the two and are now moved to after the whole rebinding is complete. This doesn't break the current users and generally makes more sense. All rebind_subsystems() users are converted accordingly.
Note that cgroup_remount() now makes two calls to rebind_subsystems() to bind and then unbind the requested subsystems. This will allow repurposing of the dummy hierarchy as the default unified hierarchy and shouldn't make any userland visible behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 100 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 78017f52c69b..b632981bd9c7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -175,8 +175,8 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long added_mask, unsigned removed_mask); +static int rebind_subsystems(struct cgroupfs_root *dst_root, + unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], @@ -739,7 +739,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - WARN_ON(rebind_subsystems(root, 0, root->subsys_mask)); + rebind_subsystems(&cgroup_dummy_root, root->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -976,69 +976,77 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } -static int rebind_subsystems(struct cgroupfs_root *root, - unsigned long added_mask, unsigned removed_mask) +static int rebind_subsystems(struct cgroupfs_root *dst_root, + unsigned long ss_mask) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *dst_top = &dst_root->top_cgroup; struct cgroup_subsys *ss; - int i, ret; + int ssid, ret; lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); - /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) - if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root) + for_each_subsys(ss, ssid) { + if (!(ss_mask & (1 << ssid))) + continue; + + /* if @ss is on the dummy_root, we can always move it */ + if (ss->root == &cgroup_dummy_root) + continue; + + /* if @ss has non-root cgroups attached to it, can't move */ + if (!list_empty(&ss->root->top_cgroup.children)) return -EBUSY; - ret = cgroup_populate_dir(cgrp, added_mask); - if (ret) - return ret; + /* can't move between two non-dummy roots either */ + if (dst_root != &cgroup_dummy_root) + return -EBUSY; + } + + if (dst_root != &cgroup_dummy_root) { + ret = cgroup_populate_dir(dst_top, ss_mask); + if (ret) + return ret; + } /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. 
*/ mutex_unlock(&cgroup_mutex); - cgroup_clear_dir(cgrp, removed_mask); + for_each_subsys(ss, ssid) + if ((ss_mask & (1 << ssid)) && ss->root != &cgroup_dummy_root) + cgroup_clear_dir(&ss->root->top_cgroup, 1 << ssid); mutex_lock(&cgroup_mutex); - for_each_subsys(ss, i) { - unsigned long bit = 1UL << i; - - if (bit & added_mask) { - /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgroup_css(cgrp, ss)); - BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); - BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); + for_each_subsys(ss, ssid) { + struct cgroupfs_root *src_root; + struct cgroup *src_top; + struct cgroup_subsys_state *css; - rcu_assign_pointer(cgrp->subsys[i], - cgroup_css(cgroup_dummy_top, ss)); - cgroup_css(cgrp, ss)->cgroup = cgrp; + if (!(ss_mask & (1 << ssid))) + continue; - ss->root = root; - if (ss->bind) - ss->bind(cgroup_css(cgrp, ss)); + src_root = ss->root; + src_top = &src_root->top_cgroup; + css = cgroup_css(src_top, ss); - /* refcount was already taken, and we're keeping it */ - root->subsys_mask |= bit; - } else if (bit & removed_mask) { - /* We're removing this subsystem */ - BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); - BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); + WARN_ON(!css || cgroup_css(dst_top, ss)); - if (ss->bind) - ss->bind(cgroup_css(cgroup_dummy_top, ss)); + RCU_INIT_POINTER(src_top->subsys[ssid], NULL); + rcu_assign_pointer(dst_top->subsys[ssid], css); + ss->root = dst_root; + css->cgroup = dst_top; - cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; - RCU_INIT_POINTER(cgrp->subsys[i], NULL); + src_root->subsys_mask &= ~(1 << ssid); + dst_root->subsys_mask |= 1 << ssid; - cgroup_subsys[i]->root = &cgroup_dummy_root; - root->subsys_mask &= ~bit; - } + if (ss->bind) + ss->bind(css); } - kernfs_activate(cgrp->kn); + if (dst_root != &cgroup_dummy_root) + kernfs_activate(dst_top->kn); return 0; } @@ -1277,10 +1285,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) goto out_unlock; } - ret = rebind_subsystems(root, added_mask, removed_mask); + ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; + rebind_subsystems(&cgroup_dummy_root, removed_mask); + if (opts.release_agent) { spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); @@ -1420,7 +1430,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) if (ret) goto destroy_root; - ret = rebind_subsystems(root, ss_mask, 0); + ret = rebind_subsystems(root, ss_mask); if (ret) goto destroy_root; @@ -4026,6 +4036,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); + cgroup_dummy_root.subsys_mask |= 1 << ss->id; + mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); } -- cgit v1.3-14-g43fede From 944196278d3dea0cece1636de417b56897d9a108 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: move ->subsys_mask from cgroupfs_root to cgroup cgroupfs_root->subsys_mask represents the controllers attached to the hierarchy. This patch moves the field to cgroup. Subsystem initialization and rebinding updates the top cgroup's subsys_mask. For !root cgroups, the subsys_mask bits are set from create_css() and cleared from kill_css(), which effectively means that all cgroups will have the same subsys_mask as the top cgroup. 
While this doesn't make any difference now, this will help implementation of the default unified hierarchy where !root cgroups may have subsets of the top_cgroup's subsys_mask. While at it, __kill_css() is split out of kill_css(). The former doesn't care about the subsys_mask while the latter becomes noop if the controller is already killed and clears the matching bit if not before proceeding to killing the css. This will be used later by the default unified hierarchy implementation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 6 ++--- kernel/cgroup.c | 61 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9f4f253f0e47..3752a0182c94 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -173,6 +173,9 @@ struct cgroup { */ u64 serial_nr; + /* The bitmask of subsystems attached to this cgroup */ + unsigned long subsys_mask; + /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -276,9 +279,6 @@ enum { struct cgroupfs_root { struct kernfs_root *kf_root; - /* The bitmask of subsystems attached to this hierarchy */ - unsigned long subsys_mask; - /* Unique id for this hierarchy. */ int hierarchy_id; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b632981bd9c7..807f88dbda51 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -526,7 +526,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->subsys_mask & (1UL << i)) { + if (root->top_cgroup.subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -739,7 +739,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgroup_dummy_root, root->subsys_mask); + rebind_subsystems(&cgroup_dummy_root, cgrp->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -1038,8 +1038,8 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, ss->root = dst_root; css->cgroup = dst_top; - src_root->subsys_mask &= ~(1 << ssid); - dst_root->subsys_mask |= 1 << ssid; + src_top->subsys_mask &= ~(1 << ssid); + dst_top->subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); @@ -1058,7 +1058,7 @@ static int cgroup_show_options(struct seq_file *seq, int ssid; for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) + if (root->top_cgroup.subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1262,12 +1262,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->top_cgroup.subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->subsys_mask; - removed_mask = root->subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->top_cgroup.subsys_mask; + removed_mask = root->top_cgroup.subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & 
CGRP_ROOT_OPTION_MASK) || @@ -1515,7 +1515,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->subsys_mask)) { + (opts.subsys_mask != root->top_cgroup.subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -3594,6 +3594,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) cgroup_get(cgrp); css_get(css->parent); + cgrp->subsys_mask |= 1 << ss->id; + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", @@ -3700,7 +3702,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->subsys_mask & (1 << ssid)) { + if (root->top_cgroup.subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; @@ -3788,17 +3790,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) queue_work(cgroup_destroy_wq, &css->destroy_work); } -/** - * kill_css - destroy a css - * @css: css to destroy - * - * This function initiates destruction of @css by removing cgroup interface - * files and putting its base reference. ->css_offline() will be invoked - * asynchronously once css_tryget() is guaranteed to fail and when the - * reference count reaches zero, @css will be released. - */ -static void kill_css(struct cgroup_subsys_state *css) +static void __kill_css(struct cgroup_subsys_state *css) { + lockdep_assert_held(&cgroup_tree_mutex); + /* * This must happen before css is disassociated with its cgroup. * See seq_css() for details. @@ -3824,6 +3819,28 @@ static void kill_css(struct cgroup_subsys_state *css) percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + struct cgroup *cgrp = css->cgroup; + + lockdep_assert_held(&cgroup_tree_mutex); + + /* if already killed, noop */ + if (cgrp->subsys_mask & (1 << css->ss->id)) { + cgrp->subsys_mask &= ~(1 << css->ss->id); + __kill_css(css); + } +} + /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4036,7 +4053,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgroup_dummy_root.subsys_mask |= 1 << ss->id; + cgroup_dummy_root.top_cgroup.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4192,7 +4209,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->subsys_mask & (1 << ssid)) + if (root->top_cgroup.subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", -- cgit v1.3-14-g43fede From 3dd06ffa9df99aa88f4e01eaa0c9d3cb362430b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: rename cgroup_dummy_root and related names The dummy root will be repurposed to serve as the default unified hierarchy. 
Let's rename things in preparation. * s/cgroup_dummy_root/cgrp_dfl_root/ * s/cgroupfs_root/cgroup_root/ as we don't do fs part directly anymore * s/cgroup_root->top_cgroup/cgroup_root->cgrp/ for brevity This is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 14 ++--- kernel/cgroup.c | 168 ++++++++++++++++++++++++------------------------- 2 files changed, 88 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3752a0182c94..77294fc66603 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -24,7 +24,7 @@ #ifdef CONFIG_CGROUPS -struct cgroupfs_root; +struct cgroup_root; struct cgroup_subsys; struct inode; struct cgroup; @@ -179,7 +179,7 @@ struct cgroup { /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; - struct cgroupfs_root *root; + struct cgroup_root *root; /* * List of cgrp_cset_links pointing at css_sets with tasks in this @@ -211,7 +211,7 @@ struct cgroup { #define MAX_CGROUP_ROOT_NAMELEN 64 -/* cgroupfs_root->flags */ +/* cgroup_root->flags */ enum { /* * Unfortunately, cgroup core and various controllers are riddled @@ -272,18 +272,18 @@ enum { }; /* - * A cgroupfs_root represents the root of a cgroup hierarchy, and may be + * A cgroup_root represents the root of a cgroup hierarchy, and may be * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ -struct cgroupfs_root { +struct cgroup_root { struct kernfs_root *kf_root; /* Unique id for this hierarchy. */ int hierarchy_id; /* The root cgroup. Root is destroyed on its release. */ - struct cgroup top_cgroup; + struct cgroup cgrp; /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ atomic_t nr_cgrps; @@ -598,7 +598,7 @@ struct cgroup_subsys { const char *name; /* link to parent, protected by cgroup_lock() */ - struct cgroupfs_root *root; + struct cgroup_root *root; /* * List of cftypes. Each entry is the first entry of an array diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 807f88dbda51..60ea16058c42 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,14 +138,11 @@ static const char *cgroup_subsys_name[] = { #undef SUBSYS /* - * The dummy hierarchy, reserved for the subsystems that are otherwise + * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. 
*/ -static struct cgroupfs_root cgroup_dummy_root; - -/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */ -static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; +static struct cgroup_root cgrp_dfl_root; /* The list of hierarchy roots */ @@ -175,7 +172,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; static void cgroup_put(struct cgroup *cgrp); -static int rebind_subsystems(struct cgroupfs_root *dst_root, +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); @@ -514,7 +511,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, struct cgroup *cgrp, struct cgroup_subsys_state *template[]) { - struct cgroupfs_root *root = cgrp->root; + struct cgroup_root *root = cgrp->root; struct cgroup_subsys *ss; struct css_set *cset; unsigned long key; @@ -526,7 +523,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, * won't change, so no need for locking. */ for_each_subsys(ss, i) { - if (root->top_cgroup.subsys_mask & (1UL << i)) { + if (root->cgrp.subsys_mask & (1UL << i)) { /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ @@ -685,14 +682,14 @@ static struct css_set *find_css_set(struct css_set *old_cset, return cset; } -static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) { - struct cgroup *top_cgrp = kf_root->kn->priv; + struct cgroup *root_cgrp = kf_root->kn->priv; - return top_cgrp->root; + return root_cgrp->root; } -static int cgroup_init_root_id(struct cgroupfs_root *root) +static int cgroup_init_root_id(struct cgroup_root *root) { int id; @@ -706,7 +703,7 @@ static int cgroup_init_root_id(struct cgroupfs_root *root) return 0; } -static void cgroup_exit_root_id(struct cgroupfs_root *root) +static void cgroup_exit_root_id(struct cgroup_root *root) { lockdep_assert_held(&cgroup_mutex); @@ -716,7 +713,7 @@ static void cgroup_exit_root_id(struct cgroupfs_root *root) } } -static void cgroup_free_root(struct cgroupfs_root *root) +static void cgroup_free_root(struct cgroup_root *root) { if (root) { /* hierarhcy ID shoulid already have been released */ @@ -727,9 +724,9 @@ static void cgroup_free_root(struct cgroupfs_root *root) } } -static void cgroup_destroy_root(struct cgroupfs_root *root) +static void cgroup_destroy_root(struct cgroup_root *root) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; mutex_lock(&cgroup_tree_mutex); @@ -739,7 +736,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) BUG_ON(!list_empty(&cgrp->children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgroup_dummy_root, cgrp->subsys_mask); + rebind_subsystems(&cgrp_dfl_root, cgrp->subsys_mask); /* * Release all the links from cset_links to this hierarchy's @@ -770,7 +767,7 @@ static void cgroup_destroy_root(struct cgroupfs_root *root) /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, - struct cgroupfs_root *root) + struct cgroup_root *root) { struct cgroup *res = NULL; @@ -778,7 +775,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, lockdep_assert_held(&css_set_rwsem); if (cset == 
&init_css_set) { - res = &root->top_cgroup; + res = &root->cgrp; } else { struct cgrp_cset_link *link; @@ -801,7 +798,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, * called with cgroup_mutex and css_set_rwsem held. */ static struct cgroup *task_cgroup_from_root(struct task_struct *task, - struct cgroupfs_root *root) + struct cgroup_root *root) { /* * No need to lock the task - since we hold cgroup_mutex the @@ -837,9 +834,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all * tasks in the system use _some_ cgroup, and since there is always at - * least one task in the system (init, pid == 1), therefore, top_cgroup + * least one task in the system (init, pid == 1), therefore, root cgroup * always has either children cgroups and/or using tasks. So we don't - * need a special hack to ensure that top_cgroup cannot be deleted. + * need a special hack to ensure that root cgroup cannot be deleted. * * P.S. One more locking exception. RCU is used to guard the * update of a tasks cgroup pointer by cgroup_attach_task() @@ -905,7 +902,7 @@ static void cgroup_free_fn(struct work_struct *work) kfree(cgrp); } else { /* - * This is top cgroup's refcnt reaching zero, which + * This is root cgroup's refcnt reaching zero, which * indicates that the root should be released. */ cgroup_destroy_root(cgrp->root); @@ -976,10 +973,9 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) } } -static int rebind_subsystems(struct cgroupfs_root *dst_root, +static int rebind_subsystems(struct cgroup_root *dst_root, unsigned long ss_mask) { - struct cgroup *dst_top = &dst_root->top_cgroup; struct cgroup_subsys *ss; int ssid, ret; @@ -991,20 +987,20 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, continue; /* if @ss is on the dummy_root, we can always move it */ - if (ss->root == &cgroup_dummy_root) + if (ss->root == &cgrp_dfl_root) continue; /* if @ss has non-root cgroups attached to it, can't move */ - if (!list_empty(&ss->root->top_cgroup.children)) + if (!list_empty(&ss->root->cgrp.children)) return -EBUSY; /* can't move between two non-dummy roots either */ - if (dst_root != &cgroup_dummy_root) + if (dst_root != &cgrp_dfl_root) return -EBUSY; } - if (dst_root != &cgroup_dummy_root) { - ret = cgroup_populate_dir(dst_top, ss_mask); + if (dst_root != &cgrp_dfl_root) { + ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); if (ret) return ret; } @@ -1015,50 +1011,48 @@ static int rebind_subsystems(struct cgroupfs_root *dst_root, */ mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) - if ((ss_mask & (1 << ssid)) && ss->root != &cgroup_dummy_root) - cgroup_clear_dir(&ss->root->top_cgroup, 1 << ssid); + if ((ss_mask & (1 << ssid)) && ss->root != &cgrp_dfl_root) + cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); mutex_lock(&cgroup_mutex); for_each_subsys(ss, ssid) { - struct cgroupfs_root *src_root; - struct cgroup *src_top; + struct cgroup_root *src_root; struct cgroup_subsys_state *css; if (!(ss_mask & (1 << ssid))) continue; src_root = ss->root; - src_top = &src_root->top_cgroup; - css = cgroup_css(src_top, ss); + css = cgroup_css(&src_root->cgrp, ss); - WARN_ON(!css || cgroup_css(dst_top, ss)); + WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss)); - RCU_INIT_POINTER(src_top->subsys[ssid], NULL); - rcu_assign_pointer(dst_top->subsys[ssid], css); + RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL); + 
rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css); ss->root = dst_root; - css->cgroup = dst_top; + css->cgroup = &dst_root->cgrp; - src_top->subsys_mask &= ~(1 << ssid); - dst_top->subsys_mask |= 1 << ssid; + src_root->cgrp.subsys_mask &= ~(1 << ssid); + dst_root->cgrp.subsys_mask |= 1 << ssid; if (ss->bind) ss->bind(css); } - if (dst_root != &cgroup_dummy_root) - kernfs_activate(dst_top->kn); + if (dst_root != &cgrp_dfl_root) + kernfs_activate(dst_root->cgrp.kn); return 0; } static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) - if (root->top_cgroup.subsys_mask & (1 << ssid)) + if (root->cgrp.subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) seq_puts(seq, ",sane_behavior"); @@ -1072,7 +1066,7 @@ static int cgroup_show_options(struct seq_file *seq, seq_printf(seq, ",release_agent=%s", root->release_agent_path); spin_unlock(&release_agent_path_lock); - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); @@ -1245,7 +1239,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1262,12 +1256,12 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - if (opts.subsys_mask != root->top_cgroup.subsys_mask || opts.release_agent) + if (opts.subsys_mask != root->cgrp.subsys_mask || opts.release_agent) pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); - added_mask = opts.subsys_mask & ~root->top_cgroup.subsys_mask; - removed_mask = root->top_cgroup.subsys_mask & ~opts.subsys_mask; + added_mask = opts.subsys_mask & ~root->cgrp.subsys_mask; + removed_mask = root->cgrp.subsys_mask & ~opts.subsys_mask; /* Don't allow flags or name to change at remount */ if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || @@ -1280,7 +1274,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (!list_empty(&root->top_cgroup.children)) { + if (!list_empty(&root->cgrp.children)) { ret = -EBUSY; goto out_unlock; } @@ -1289,7 +1283,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgroup_dummy_root, removed_mask); + rebind_subsystems(&cgrp_dfl_root, removed_mask); if (opts.release_agent) { spin_lock(&release_agent_path_lock); @@ -1368,10 +1362,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dummy_css.cgroup = cgrp; } -static void init_cgroup_root(struct cgroupfs_root *root, +static void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) { - struct cgroup *cgrp = &root->top_cgroup; + struct cgroup *cgrp = &root->cgrp; INIT_LIST_HEAD(&root->root_list); atomic_set(&root->nr_cgrps, 1); @@ -1385,13 +1379,13 @@ static void 
init_cgroup_root(struct cgroupfs_root *root, if (opts->name) strcpy(root->name, opts->name); if (opts->cpuset_clone_children) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) { LIST_HEAD(tmp_links); - struct cgroup *root_cgrp = &root->top_cgroup; + struct cgroup *root_cgrp = &root->cgrp; struct css_set *cset; int i, ret; @@ -1443,7 +1437,7 @@ static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) cgroup_root_count++; /* - * Link the top cgroup in this hierarchy into all the css_set + * Link the root cgroup in this hierarchy into all the css_set * objects. */ down_write(&css_set_rwsem); @@ -1472,7 +1466,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - struct cgroupfs_root *root; + struct cgroup_root *root; struct cgroup_sb_opts opts; struct dentry *dentry; int ret; @@ -1496,7 +1490,7 @@ retry: for_each_root(root) { bool name_match = false; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; /* @@ -1515,7 +1509,7 @@ retry: * subsystems) then they must match. */ if ((opts.subsys_mask || opts.none) && - (opts.subsys_mask != root->top_cgroup.subsys_mask)) { + (opts.subsys_mask != root->cgrp.subsys_mask)) { if (!name_match) continue; ret = -EBUSY; @@ -1533,13 +1527,13 @@ retry: } /* - * A root's lifetime is governed by its top cgroup. Zero + * A root's lifetime is governed by its root cgroup. Zero * ref indicate that the root is being destroyed. Wait for * destruction to complete so that the subsystems are free. * We can use wait_queue for the wait but this path is * super cold. Let's just sleep for a bit and retry. 
*/ - if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { + if (!atomic_inc_not_zero(&root->cgrp.refcnt)) { mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); kfree(opts.release_agent); @@ -1586,16 +1580,16 @@ out_unlock: dentry = kernfs_mount(fs_type, flags, root->kf_root); if (IS_ERR(dentry)) - cgroup_put(&root->top_cgroup); + cgroup_put(&root->cgrp); return dentry; } static void cgroup_kill_sb(struct super_block *sb) { struct kernfs_root *kf_root = kernfs_root_from_sb(sb); - struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); + struct cgroup_root *root = cgroup_root_from_kf(kf_root); - cgroup_put(&root->top_cgroup); + cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } @@ -1622,7 +1616,7 @@ static struct kobject *cgroup_kobj; */ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { - struct cgroupfs_root *root; + struct cgroup_root *root; struct cgroup *cgrp; int hierarchy_id = 1; char *path = NULL; @@ -2112,14 +2106,14 @@ out_unlock_cgroup: */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { - struct cgroupfs_root *root; + struct cgroup_root *root; int retval = 0; mutex_lock(&cgroup_mutex); for_each_root(root) { struct cgroup *from_cgrp; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; down_read(&css_set_rwsem); @@ -2151,7 +2145,7 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, static int cgroup_release_agent_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroupfs_root *root = css->cgroup->root; + struct cgroup_root *root = css->cgroup->root; BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) @@ -2362,14 +2356,14 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *root = &ss->root->top_cgroup; + struct cgroup *root = &ss->root->cgrp; struct cgroup_subsys_state *css; int ret = 0; lockdep_assert_held(&cgroup_tree_mutex); /* don't bother if @ss isn't attached */ - if (ss->root == &cgroup_dummy_root) + if (ss->root == &cgrp_dfl_root) return 0; /* add/rm files for all cgroups created before */ @@ -3623,7 +3617,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, umode_t mode) { struct cgroup *cgrp; - struct cgroupfs_root *root = parent->root; + struct cgroup_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; struct kernfs_node *kn; @@ -3702,7 +3696,7 @@ static long cgroup_create(struct cgroup *parent, const char *name, /* let's create and online css's */ for_each_subsys(ss, ssid) { - if (root->top_cgroup.subsys_mask & (1 << ssid)) { + if (root->cgrp.subsys_mask & (1 << ssid)) { err = create_css(cgrp, ss); if (err) goto err_destroy; @@ -4031,17 +4025,17 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) INIT_LIST_HEAD(&ss->cfts); - /* Create the top cgroup state for this subsystem */ - ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); + /* Create the root cgroup state for this subsystem */ + ss->root = &cgrp_dfl_root; + css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_css(css, ss, cgroup_dummy_top); + init_css(css, ss, &cgrp_dfl_root.cgrp); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is * newly registered, all tasks and hence the - * init_css_set is in the subsystem's top 
cgroup. */ + * init_css_set is in the subsystem's root cgroup. */ init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4053,7 +4047,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); - cgroup_dummy_root.top_cgroup.subsys_mask |= 1 << ss->id; + cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4071,7 +4065,7 @@ int __init cgroup_init_early(void) struct cgroup_subsys *ss; int i; - init_cgroup_root(&cgroup_dummy_root, &opts); + init_cgroup_root(&cgrp_dfl_root, &opts); RCU_INIT_POINTER(init_task.cgroups, &init_css_set); for_each_subsys(ss, i) { @@ -4112,7 +4106,7 @@ int __init cgroup_init(void) key = css_set_hash(init_css_set.subsys); hash_add(css_set_table, &init_css_set.hlist, key); - BUG_ON(cgroup_setup_root(&cgroup_dummy_root, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); @@ -4181,7 +4175,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct task_struct *tsk; char *buf, *path; int retval; - struct cgroupfs_root *root; + struct cgroup_root *root; retval = -ENOMEM; buf = kmalloc(PATH_MAX, GFP_KERNEL); @@ -4204,12 +4198,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgroup_dummy_root) + if (root == &cgrp_dfl_root) continue; seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(ss, ssid) - if (root->top_cgroup.subsys_mask & (1 << ssid)) + if (root->cgrp.subsys_mask & (1 << ssid)) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); if (strlen(root->name)) seq_printf(m, "%sname=%s", count ? "," : "", @@ -4639,7 +4633,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) struct cgroup *c = link->cgrp; const char *name = "?"; - if (c != cgroup_dummy_top) { + if (c != &cgrp_dfl_root.cgrp) { cgroup_name(c, name_buf, NAME_MAX + 1); name = name_buf; } -- cgit v1.3-14-g43fede From 4d3bb511b5f9980fc3e9ae5939ebc475b231d3fc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:54 -0400 Subject: cgroup: drop const from @buffer of cftype->write_string() cftype->write_string() just passes on the writeable buffer from kernfs and there's no reason to add const restriction on the buffer. The only thing const achieves is unnecessarily complicating parsing of the buffer. Drop const from @buffer. 
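As an illustration of the simplification (this handler and its parsing are hypothetical, not part of the patch; only the char *buffer signature comes from it), a writable buffer lets a handler trim and tokenize in place with the standard string helpers instead of duplicating the string first:

static int foo_write_string(struct cgroup_subsys_state *css,
			    struct cftype *cft, char *buffer)
{
	char *tok;

	buffer = strstrip(buffer);	/* trim in place - legal without const */
	while ((tok = strsep(&buffer, " ")) != NULL) {	/* split in place, too */
		if (!*tok)
			continue;
		/* ... parse @tok ... */
	}
	return 0;
}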
Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki --- block/blk-throttle.c | 4 ++-- block/cfq-iosched.c | 4 ++-- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 2 +- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 2 +- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 4 ++-- net/core/netprio_cgroup.c | 2 +- net/ipv4/tcp_memcontrol.c | 2 +- security/device_cgroup.c | 4 ++-- 11 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 861c363e4129..033745cd7fba 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1408,13 +1408,13 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, } static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buf) + char *buf) { return tg_set_conf(css, cft, buf, true); } static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buf) + char *buf) { return tg_set_conf(css, cft, buf, false); } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 461187943392..f5de45b6af36 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1701,13 +1701,13 @@ static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, } static int cfqg_set_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) + struct cftype *cft, char *buf) { return __cfqg_set_weight_device(css, cft, buf, false); } static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) + struct cftype *cft, char *buf) { return __cfqg_set_weight_device(css, cft, buf, true); } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 77294fc66603..79993ac066c5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -454,7 +454,7 @@ struct cftype { * Returns 0 or -ve error code. */ int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer); + char *buffer); /* * trigger() callback can be used to get some kick from the * userspace, when the actual string written is not important diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 60ea16058c42..f5754910e80b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2143,7 +2143,7 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, } static int cgroup_release_agent_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { struct cgroup_root *root = css->cgroup->root; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2ea98b216bff..2bc4a2256444 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -442,7 +442,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) } static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { bool freeze; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8d5324583aa4..efbf9baf77ec 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1610,7 +1610,7 @@ out_unlock: * Common handling for a write to a "cpus" or "mems" file. 
*/ static int cpuset_write_resmask(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buf) + struct cftype *cft, char *buf) { struct cpuset *cs = css_cs(css); struct cpuset *trialcs; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index b135853e68f3..595d7fd795e1 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -254,7 +254,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, } static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { int idx, name, ret; unsigned long long val; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9c6ac1532e6..96f94a9f2faf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5242,7 +5242,7 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, * RES_LIMIT. */ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); enum res_type type; @@ -6063,7 +6063,7 @@ static void memcg_event_ptable_queue_proc(struct file *file, * Interpretation of args is defined by control file implementation. */ static int memcg_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f9f3a40d3350..3825f669147b 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -186,7 +186,7 @@ static int read_priomap(struct seq_file *sf, void *v) } static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { char devname[IFNAMSIZ + 1]; struct net_device *dev; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 20a0aca9131e..d4f015ad6c84 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -103,7 +103,7 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) } static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long long val; diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 7f88bcde7c61..8365909f5f8c 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -496,7 +496,7 @@ static inline bool has_children(struct dev_cgroup *devcgroup) * parent cgroup has the access you're asking for. */ static int devcgroup_update_access(struct dev_cgroup *devcgroup, - int filetype, const char *buffer) + int filetype, char *buffer) { const char *b; char temp[12]; /* 11 + 1 characters needed for a u32 */ @@ -652,7 +652,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, } static int devcgroup_access_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { int retval; -- cgit v1.3-14-g43fede From a2dd424750807f83632a2a70293961086fd8f870 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:55 -0400 Subject: cgroup: make cgrp_dfl_root mountable cgrp_dfl_root will be used as the default unified hierarchy. This patch makes cgrp_dfl_root mountable by making the following changes. * cgroup_init_early() now initializes cgrp_dfl_root w/ CGRP_ROOT_SANE_BEHAVIOR. The default hierarchy is always sane. 
* parse_cgroupfs_options() and cgroup_mount() are updated such that cgrp_dfl_root is mounted if sane_behavior is specified w/o any subsystems. * rebind_subsystems() now populates the root directory of cgrp_dfl_root. Note that the function still guarantees success of rebinding subsystems to cgrp_dfl_root. If populating fails while rebinding to cgrp_dfl_root, it whines but ignores the error. * For backward compatibility, the default hierarchy shows up in /proc/$PID/cgroup only after it's explicitly mounted so that userland which doesn't make use of it doesn't see any change. * "current_css_set_cg_links" file of debug cgroup now treats the default hierarchy the same as other hierarchies. This is visible to userland. Given that it's for the debug controller, this should be fine. * While at it, implement cgroup_on_dfl() which tests whether a given cgroup is on the default hierarchy or not. The above changes make cgrp_dfl_root mostly equivalent to other controllers but the actual unified hierarchy behaviors are not implemented yet. Let's plug child cgroup creation in cgrp_dfl_root from create_cgroup() for now. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 10 ++++++ kernel/cgroup.c | 94 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 71 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 79993ac066c5..7e9fa505b7bb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -250,6 +250,9 @@ enum { * * - "cgroup.clone_children" is removed. * + * - If mount is requested with sane_behavior but without any + * subsystem, the default unified hierarchy is mounted. + * * - cpuset: tasks will be kept in empty cpusets when hotplug happens * and take masks of ancestors with non-empty cpus/mems, instead of * being moved to an ancestor. @@ -468,6 +471,13 @@ struct cftype { #endif }; +extern struct cgroup_root cgrp_dfl_root; + +static inline bool cgroup_on_dfl(const struct cgroup *cgrp) +{ + return cgrp->root == &cgrp_dfl_root; +} + /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f5754910e80b..69b4939e9f6d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -142,7 +142,13 @@ static const char *cgroup_subsys_name[] = { * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -static struct cgroup_root cgrp_dfl_root; +struct cgroup_root cgrp_dfl_root; + +/* + * The default hierarchy always exists but is hidden until mounted for the + * first time. This is for backward compatibility. + */ +static bool cgrp_dfl_root_visible; /* The list of hierarchy roots */ @@ -999,10 +1005,22 @@ static int rebind_subsystems(struct cgroup_root *dst_root, return -EBUSY; } - if (dst_root != &cgrp_dfl_root) { - ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); - if (ret) + ret = cgroup_populate_dir(&dst_root->cgrp, ss_mask); + if (ret) { + if (dst_root != &cgrp_dfl_root) return ret; + + /* + * Rebinding back to the default root is not allowed to + * fail. Using both default and non-default roots should + * be rare. Moving subsystems back and forth even more so. + * Just warn about it and continue.
+ */ + if (cgrp_dfl_root_visible) { + pr_warning("cgroup: failed to create files (%d) while rebinding 0x%lx to default root\n", + ret, ss_mask); + pr_warning("cgroup: you may retry by moving them to a different hierarchy and unbinding\n"); + } } /* @@ -1011,7 +1029,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, */ mutex_unlock(&cgroup_mutex); for_each_subsys(ss, ssid) - if ((ss_mask & (1 << ssid)) && ss->root != &cgrp_dfl_root) + if (ss_mask & (1 << ssid)) cgroup_clear_dir(&ss->root->cgrp, 1 << ssid); mutex_lock(&cgroup_mutex); @@ -1039,8 +1057,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, ss->bind(css); } - if (dst_root != &cgrp_dfl_root) - kernfs_activate(dst_root->cgrp.kn); + kernfs_activate(dst_root->cgrp.kn); return 0; } @@ -1190,16 +1207,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - /* - * If the 'all' option was specified select all the subsystems, - * otherwise if 'none', 'name=' and a subsystem name options - * were not specified, let's default to 'all' - */ - if (all_ss || (!one_ss && !opts->none && !opts->name)) - for_each_subsys(ss, i) - if (!ss->disabled) - set_bit(i, &opts->subsys_mask); - /* Consistency checks */ if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { @@ -1211,6 +1218,23 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } + } else { + /* + * If the 'all' option was specified select all the + * subsystems, otherwise if 'none', 'name=' and a subsystem + * name options were not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) + for_each_subsys(ss, i) + if (!ss->disabled) + set_bit(i, &opts->subsys_mask); + + /* + * We either have to specify by name or by subsystems. (So + * all empty hierarchies must have a name). + */ + if (!opts->subsys_mask && !opts->name) + return -EINVAL; } /* @@ -1226,13 +1250,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->subsys_mask && opts->none) return -EINVAL; - /* - * We either have to specify by name or by subsystems. (So all - * empty hierarchies must have a name). - */ - if (!opts->subsys_mask && !opts->name) - return -EINVAL; - return 0; } @@ -1487,6 +1504,14 @@ retry: goto out_unlock; /* look for a matching existing root */ + if (!opts.subsys_mask && !opts.none && !opts.name) { + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + ret = 0; + goto out_unlock; + } + for_each_root(root) { bool name_match = false; @@ -3622,6 +3647,13 @@ static long cgroup_create(struct cgroup *parent, const char *name, struct cgroup_subsys *ss; struct kernfs_node *kn; + /* + * XXX: The default hierarchy isn't fully implemented yet. Block + * !root cgroup creation on it for now. 
+ */ + if (root == &cgrp_dfl_root) + return -EINVAL; + /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) @@ -4061,7 +4093,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init cgroup_init_early(void) { - static struct cgroup_sb_opts __initdata opts = { }; + static struct cgroup_sb_opts __initdata opts = + { .flags = CGRP_ROOT_SANE_BEHAVIOR }; struct cgroup_subsys *ss; int i; @@ -4198,7 +4231,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root) + if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -4631,15 +4664,10 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name = "?"; - - if (c != &cgrp_dfl_root.cgrp) { - cgroup_name(c, name_buf, NAME_MAX + 1); - name = name_buf; - } + cgroup_name(c, name_buf, NAME_MAX + 1); seq_printf(seq, "Root %d group %s\n", - c->root->hierarchy_id, name); + c->root->hierarchy_id, name_buf); } rcu_read_unlock(); up_read(&css_set_rwsem); -- cgit v1.3-14-g43fede From 8cbbf2c972c4444cad36f61cd571714c39b8cf04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 10:23:55 -0400 Subject: cgroup: implement CFTYPE_ONLY_ON_DFL This cftype flag makes the file only appear on the default hierarchy. This will later be used for cgroup.controllers file. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7e9fa505b7bb..43d1ed30bae3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -384,6 +384,7 @@ enum { CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ + CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */ }; #define MAX_CFTYPE_NAME 64 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 69b4939e9f6d..37b6d534b0ca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2356,6 +2356,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) + continue; if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) -- cgit v1.3-14-g43fede From 6404e88e8385638123f4b18b104430480870601a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 7 Mar 2014 09:22:19 -0700 Subject: resources: Set type in __request_region() We don't set the type (I/O, memory, etc.) of resources added by __request_region(), which leads to confusing messages like this: address space collision: [io 0x1000-0x107f] conflicts with ACPI CPU throttle [??? 0x00001010-0x00001015 flags 0x80000000] Set the type of a new resource added by __request_region() (used by request_region() and request_mem_region()) to the type of its parent. 
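In code, the change just seeds the new resource's flags from its parent before adding the busy and caller-supplied bits (this restates the two-line kernel/resource.c hunk below):

	res->flags = resource_type(parent);	/* inherit the parent's type (I/O vs memory) */
	res->flags |= IORESOURCE_BUSY | flags;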
This makes the resource tree internally consistent and fixes messages like the above, where the ACPI CPU throttle resource really is an I/O port region, but request_region() didn't fill in the type, so %pR didn't know how to print it. Sample dmesg showing the issue at the link below. Link: https://bugzilla.kernel.org/show_bug.cgi?id=71611 Reported-by: Paul Bolle Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index a8344dda7049..673061c06da1 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -945,8 +945,8 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = IORESOURCE_BUSY; - res->flags |= flags; + res->flags = resource_type(parent); + res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); -- cgit v1.3-14-g43fede From 1b9aba49eab5e85b0d3de8ba630cda6d68546297 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 19 Mar 2014 17:43:21 -0400 Subject: cgroup: fix cgroup_taskset walking order cgroup_taskset is used to track and iterate target tasks while migrating a task or process and should guarantee that the first task iterated is the task group leader if a process is being migrated. b3dc094e9390 ("cgroup: use css_set->mg_tasks to track target tasks during migration") replaced flex array cgroup_taskset->tc_array with css_set->mg_tasks list to remove process size limit and dynamic allocation during migration; unfortunately, it incorrectly used list operations which don't preserve order breaking the guarantee that cgroup_taskset_first() returns the leader for a process target. Fix it by using order preserving list operations. Note that as multiple src_csets may map to a single dst_cset, the iteration order may change across cgroup_task_migrate(); however, the leader is still guaranteed to be the first entry. The switch to list_splice_tail_init() at the end of cgroup_migrate() isn't strictly necessary. Let's still do it for consistency. Signed-off-by: Tejun Heo --- kernel/cgroup.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 37b6d534b0ca..98a8045e2149 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1761,7 +1761,14 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, get_css_set(new_cset); rcu_assign_pointer(tsk->cgroups, new_cset); - list_move(&tsk->cg_list, &new_cset->mg_tasks); + + /* + * Use move_tail so that cgroup_taskset_first() still returns the + * leader after migration. This works because cgroup_migrate() + * ensures that the dst_cset of the leader is the first on the + * tset's dst_csets list. + */ + list_move_tail(&tsk->cg_list, &new_cset->mg_tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1936,9 +1943,16 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader, if (!cset->mg_src_cgrp) goto next; - list_move(&task->cg_list, &cset->mg_tasks); - list_move(&cset->mg_node, &tset.src_csets); - list_move(&cset->mg_dst_cset->mg_node, &tset.dst_csets); + /* + * cgroup_taskset_first() must always return the leader. + * Take care to avoid disturbing the ordering. 
+ */ + list_move_tail(&task->cg_list, &cset->mg_tasks); + if (list_empty(&cset->mg_node)) + list_add_tail(&cset->mg_node, &tset.src_csets); + if (list_empty(&cset->mg_dst_cset->mg_node)) + list_move_tail(&cset->mg_dst_cset->mg_node, + &tset.dst_csets); next: if (!threadgroup) break; @@ -1999,7 +2013,7 @@ out_release_tset: down_write(&css_set_rwsem); list_splice_init(&tset.dst_csets, &tset.src_csets); list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) { - list_splice_init(&cset->mg_tasks, &cset->tasks); + list_splice_tail_init(&cset->mg_tasks, &cset->tasks); list_del_init(&cset->mg_node); } up_write(&css_set_rwsem); -- cgit v1.3-14-g43fede From c41eba7de133e43ea2c998ccd484059feab200f6 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 18 Mar 2014 13:23:15 +0530 Subject: timer: Use variable head instead of &work_list in __run_timers() We already have a variable 'head' that points to '&work_list', and so we should use that instead wherever possible. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/0d8645a6efc8360c4196c9797d59343abbfdcc5e.1395129136.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 949d74ea0ce4..8e503fec1fba 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1192,7 +1192,7 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, &work_list); + list_replace_init(base->tv1.vec + index, head); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; -- cgit v1.3-14-g43fede From 6201b4d61fbf194df6371fb3376c5026cb8f5eec Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 18 Mar 2014 16:26:07 +0530 Subject: timer: Remove code redundancy while calling get_nohz_timer_target() There are only two users of get_nohz_timer_target(): timer and hrtimer. Both call it under same circumstances, i.e. #ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif So, it makes more sense to get all this as part of get_nohz_timer_target() instead of duplicating code at two places. For this another parameter is required to be passed to this routine, pinned. 
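A condensed sketch of the consolidated helper (it mirrors the kernel/sched/core.c hunk below; the sched-domain scan for a non-idle CPU is elided here):

int get_nohz_timer_target(int pinned)
{
	int cpu = smp_processor_id();

	/* the checks each caller used to open-code under #ifdef */
	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
		return cpu;

	/* ... otherwise scan the sched domains for a non-idle cpu ... */
	return cpu;
}

Callers then shrink to a plain cpu = get_nohz_timer_target(pinned); with no CONFIG_NO_HZ_COMMON guard, as the timer and hrtimer hunks below show.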
Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1e1b53537217d58d48c2d7a222a9c3ac47d5b64c.1395140107.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 6 +++++- kernel/hrtimer.c | 15 +-------------- kernel/sched/core.c | 5 ++++- kernel/timer.c | 7 +------ 4 files changed, 11 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 68a0e84463a0..6f6c56f63c68 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -291,10 +291,14 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(void); +extern int get_nohz_timer_target(int pinned); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } +static inline int get_nohz_timer_target(int pinned) +{ + return smp_processor_id(); +} #endif /* diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 09094361dce5..d55092ceee29 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -168,19 +168,6 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } - -/* - * Get the preferred target CPU for NOHZ - */ -static int hrtimer_get_target(int this_cpu, int pinned) -{ -#ifdef CONFIG_NO_HZ_COMMON - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) - return get_nohz_timer_target(); -#endif - return this_cpu; -} - /* * With HIGHRES=y we do not migrate the timer when it is expiring * before the next event on the target cpu because we cannot reprogram @@ -214,7 +201,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); - int cpu = hrtimer_get_target(this_cpu, pinned); + int cpu = get_nohz_timer_target(pinned); int basenum = base->index; again: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aab..c0339e206cc2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -555,12 +555,15 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(void) +int get_nohz_timer_target(int pinned) { int cpu = smp_processor_id(); int i; struct sched_domain *sd; + if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + return cpu; + rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { diff --git a/kernel/timer.c b/kernel/timer.c index 8e503fec1fba..1d35ddadc045 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -760,12 +760,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = smp_processor_id(); - -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) - cpu = get_nohz_timer_target(); -#endif + cpu = get_nohz_timer_target(pinned); new_base = per_cpu(tvec_bases, cpu); if (base != new_base) { -- cgit v1.3-14-g43fede From a19423b98704aa85e84097be6d1d44a8615c2340 Mon Sep 17 00:00:00 2001 From: "Gautham R. 
Shenoy" Date: Tue, 11 Mar 2014 02:04:03 +0530 Subject: CPU hotplug: Add lockdep annotations to get/put_online_cpus() Add lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin()/cpu_hotplug_end(). Cc: Ingo Molnar Reviewed-by: Oleg Nesterov Signed-off-by: Gautham R. Shenoy Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/cpu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index deff2e693766..33caf5e97701 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "smpboot.h" @@ -57,17 +58,30 @@ static struct { * an ongoing cpu hotplug operation. */ int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif } cpu_hotplug = { .active_writer = NULL, .lock = __MUTEX_INITIALIZER(cpu_hotplug.lock), .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "cpu_hotplug.lock" }, +#endif }; +/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ +#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) +#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) +#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) + void get_online_cpus(void) { might_sleep(); if (cpu_hotplug.active_writer == current) return; + cpuhp_lock_acquire_read(); mutex_lock(&cpu_hotplug.lock); cpu_hotplug.refcount++; mutex_unlock(&cpu_hotplug.lock); @@ -87,6 +101,7 @@ void put_online_cpus(void) if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) wake_up_process(cpu_hotplug.active_writer); mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } EXPORT_SYMBOL_GPL(put_online_cpus); @@ -117,6 +132,7 @@ void cpu_hotplug_begin(void) { cpu_hotplug.active_writer = current; + cpuhp_lock_acquire(); for (;;) { mutex_lock(&cpu_hotplug.lock); if (likely(!cpu_hotplug.refcount)) @@ -131,6 +147,7 @@ void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); + cpuhp_lock_release(); } /* -- cgit v1.3-14-g43fede From 93ae4f978ca7f26d17df915ac7afc919c1dd0353 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 11 Mar 2014 02:04:14 +0530 Subject: CPU hotplug: Provide lockless versions of callback registration functions The following method of CPU hotplug callback registration is not safe due to the possibility of an ABBA deadlock involving the cpu_add_remove_lock and the cpu_hotplug.lock. get_online_cpus(); for_each_online_cpu(cpu) init_cpu(cpu); register_cpu_notifier(&foobar_cpu_notifier); put_online_cpus(); The deadlock is shown below: CPU 0 CPU 1 ----- ----- Acquire cpu_hotplug.lock [via get_online_cpus()] CPU online/offline operation takes cpu_add_remove_lock [via cpu_maps_update_begin()] Try to acquire cpu_add_remove_lock [via register_cpu_notifier()] CPU online/offline operation tries to acquire cpu_hotplug.lock [via cpu_hotplug_begin()] *** DEADLOCK! *** The problem here is that callback registration takes the locks in one order whereas the CPU hotplug operations take the same locks in the opposite order. To avoid this issue and to provide a race-free method to register CPU hotplug callbacks (along with initialization of already online CPUs), introduce new variants of the callback registration APIs that simply register the callbacks without holding the cpu_add_remove_lock during the registration. That way, we can avoid the ABBA scenario. 
However, we will need to hold the cpu_add_remove_lock throughout the entire critical section, to protect updates to the callback/notifier chain. This can be achieved by writing the callback registration code as follows: cpu_maps_update_begin(); [ or cpu_notifier_register_begin(); see below ] for_each_online_cpu(cpu) init_cpu(cpu); /* This doesn't take the cpu_add_remove_lock */ __register_cpu_notifier(&foobar_cpu_notifier); cpu_maps_update_done(); [ or cpu_notifier_register_done(); see below ] Note that we can't use get_online_cpus() here instead of cpu_maps_update_begin() because the cpu_hotplug.lock is dropped during the invocation of CPU_POST_DEAD notifiers, and hence get_online_cpus() cannot provide the necessary synchronization to protect the callback/notifier chains against concurrent reads and writes. On the other hand, since the cpu_add_remove_lock protects the entire hotplug operation (including CPU_POST_DEAD), we can use cpu_maps_update_begin/done() to guarantee proper synchronization. Also, since cpu_maps_update_begin/done() is like a super-set of get/put_online_cpus(), the former naturally protects the critical sections from concurrent hotplug operations. Since the names cpu_maps_update_begin/done() don't make much sense in CPU hotplug callback registration scenarios, we'll introduce new APIs named cpu_notifier_register_begin/done() and map them to cpu_maps_update_begin/done(). In summary, introduce the lockless variants of un/register_cpu_notifier() and also export the cpu_notifier_register_begin/done() APIs for use by modules. This way, we provide a race-free way to register hotplug callbacks as well as perform initialization for the CPUs that are already online. Cc: Thomas Gleixner Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Acked-by: Oleg Nesterov Acked-by: Toshi Kani Reviewed-by: Gautham R. Shenoy Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki --- include/linux/cpu.h | 47 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/cpu.c | 21 +++++++++++++++++++-- 2 files changed, 66 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 03e235ad1bba..488d6ebcf6a1 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -122,26 +122,46 @@ enum { { .notifier_call = fn, .priority = pri }; \ register_cpu_notifier(&fn##_nb); \ } + +#define __cpu_notifier(fn, pri) { \ + static struct notifier_block fn##_nb = \ + { .notifier_call = fn, .priority = pri }; \ + __register_cpu_notifier(&fn##_nb); \ +} #else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ + #ifdef CONFIG_HOTPLUG_CPU extern int register_cpu_notifier(struct notifier_block *nb); +extern int __register_cpu_notifier(struct notifier_block *nb); extern void unregister_cpu_notifier(struct notifier_block *nb); +extern void __unregister_cpu_notifier(struct notifier_block *nb); #else #ifndef MODULE extern int register_cpu_notifier(struct notifier_block *nb); +extern int __register_cpu_notifier(struct notifier_block *nb); #else static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; } + +static inline int __register_cpu_notifier(struct notifier_block *nb) +{ + return 0; +} #endif static inline void unregister_cpu_notifier(struct notifier_block *nb) { } + +static inline void __unregister_cpu_notifier(struct notifier_block *nb) +{ +} #endif int cpu_up(unsigned int cpu); @@ -149,19 +169,32 @@ void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); +#define cpu_notifier_register_begin cpu_maps_update_begin +#define cpu_notifier_register_done cpu_maps_update_done + #else /* CONFIG_SMP */ #define cpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; } +static inline int __register_cpu_notifier(struct notifier_block *nb) +{ + return 0; +} + static inline void unregister_cpu_notifier(struct notifier_block *nb) { } +static inline void __unregister_cpu_notifier(struct notifier_block *nb) +{ +} + static inline void cpu_maps_update_begin(void) { } @@ -170,6 +203,14 @@ static inline void cpu_maps_update_done(void) { } +static inline void cpu_notifier_register_begin(void) +{ +} + +static inline void cpu_notifier_register_done(void) +{ +} + #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; @@ -183,8 +224,11 @@ extern void put_online_cpus(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); #define hotcpu_notifier(fn, pri) cpu_notifier(fn, pri) +#define __hotcpu_notifier(fn, pri) __cpu_notifier(fn, pri) #define register_hotcpu_notifier(nb) register_cpu_notifier(nb) +#define __register_hotcpu_notifier(nb) __register_cpu_notifier(nb) #define unregister_hotcpu_notifier(nb) unregister_cpu_notifier(nb) +#define __unregister_hotcpu_notifier(nb) __unregister_cpu_notifier(nb) void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); @@ -197,9 +241,12 @@ static inline void cpu_hotplug_done(void) {} #define cpu_hotplug_disable() do { } while (0) #define cpu_hotplug_enable() do { } while (0) #define hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define 
__hotcpu_notifier(fn, pri) do { (void)(fn); } while (0) /* These aren't inline functions due to a GCC bug. */ #define register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) +#define __register_hotcpu_notifier(nb) ({ (void)(nb); 0; }) #define unregister_hotcpu_notifier(nb) ({ (void)(nb); }) +#define __unregister_hotcpu_notifier(nb) ({ (void)(nb); }) #endif /* CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_PM_SLEEP_SMP diff --git a/kernel/cpu.c b/kernel/cpu.c index 33caf5e97701..a9e710eef0e2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -28,18 +28,23 @@ static DEFINE_MUTEX(cpu_add_remove_lock); /* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. + * The following two APIs (cpu_maps_update_begin/done) must be used when + * attempting to serialize the updates to cpu_online_mask & cpu_present_mask. + * The APIs cpu_notifier_register_begin/done() must be used to protect CPU + * hotplug callback (un)registration performed using __register_cpu_notifier() + * or __unregister_cpu_notifier(). */ void cpu_maps_update_begin(void) { mutex_lock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_begin); void cpu_maps_update_done(void) { mutex_unlock(&cpu_add_remove_lock); } +EXPORT_SYMBOL(cpu_notifier_register_done); static RAW_NOTIFIER_HEAD(cpu_chain); @@ -183,6 +188,11 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +int __ref __register_cpu_notifier(struct notifier_block *nb) +{ + return raw_notifier_chain_register(&cpu_chain, nb); +} + static int __cpu_notify(unsigned long val, void *v, int nr_to_call, int *nr_calls) { @@ -206,6 +216,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) BUG_ON(cpu_notify(val, v)); } EXPORT_SYMBOL(register_cpu_notifier); +EXPORT_SYMBOL(__register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) { @@ -215,6 +226,12 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); +void __ref __unregister_cpu_notifier(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&cpu_chain, nb); +} +EXPORT_SYMBOL(__unregister_cpu_notifier); + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id -- cgit v1.3-14-g43fede From d39ad278a3001c860da4d7c13e51259b1904bec5 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 11 Mar 2014 02:11:56 +0530 Subject: trace, ring-buffer: Fix CPU hotplug callback registration Subsystems that want to register CPU hotplug callbacks, as well as perform initialization for the CPUs that are already online, often do it as shown below: get_online_cpus(); for_each_online_cpu(cpu) init_cpu(cpu); register_cpu_notifier(&foobar_cpu_notifier); put_online_cpus(); This is wrong, since it is prone to ABBA deadlocks involving the cpu_add_remove_lock and the cpu_hotplug.lock (when running concurrently with CPU hotplug operations). Instead, the correct and race-free way of performing the callback registration is: cpu_notifier_register_begin(); for_each_online_cpu(cpu) init_cpu(cpu); /* Note the use of the double underscored version of the API */ __register_cpu_notifier(&foobar_cpu_notifier); cpu_notifier_register_done(); Fix the tracing ring-buffer code by using this latter form of callback registration. Cc: Frederic Weisbecker Cc: Ingo Molnar Acked-by: Steven Rostedt Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki --- kernel/trace/ring_buffer.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fc4da2d97f9b..c634868c2921 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1301,7 +1301,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, * In that off case, we need to allocate for all possible cpus. */ #ifdef CONFIG_HOTPLUG_CPU - get_online_cpus(); + cpu_notifier_register_begin(); cpumask_copy(buffer->cpumask, cpu_online_mask); #else cpumask_copy(buffer->cpumask, cpu_possible_mask); @@ -1324,10 +1324,10 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, #ifdef CONFIG_HOTPLUG_CPU buffer->cpu_notify.notifier_call = rb_cpu_notify; buffer->cpu_notify.priority = 0; - register_cpu_notifier(&buffer->cpu_notify); + __register_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_done(); #endif - put_online_cpus(); mutex_init(&buffer->mutex); return buffer; @@ -1341,7 +1341,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, fail_free_cpumask: free_cpumask_var(buffer->cpumask); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif fail_free_buffer: kfree(buffer); @@ -1358,16 +1360,17 @@ ring_buffer_free(struct ring_buffer *buffer) { int cpu; - get_online_cpus(); - #ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&buffer->cpu_notify); + cpu_notifier_register_begin(); + __unregister_cpu_notifier(&buffer->cpu_notify); #endif for_each_buffer_cpu(buffer, cpu) rb_free_cpu_buffer(buffer->buffers[cpu]); - put_online_cpus(); +#ifdef CONFIG_HOTPLUG_CPU + cpu_notifier_register_done(); +#endif kfree(buffer->buffers); free_cpumask_var(buffer->cpumask); -- cgit v1.3-14-g43fede From c270a817196a9374a2dc730624d1501dced40b4d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Tue, 11 Mar 2014 02:12:08 +0530 Subject: profile: Fix CPU hotplug callback registration Subsystems that want to register CPU hotplug callbacks, as well as perform initialization for the CPUs that are already online, often do it as shown below: get_online_cpus(); for_each_online_cpu(cpu) init_cpu(cpu); register_cpu_notifier(&foobar_cpu_notifier); put_online_cpus(); This is wrong, since it is prone to ABBA deadlocks involving the cpu_add_remove_lock and the cpu_hotplug.lock (when running concurrently with CPU hotplug operations). Instead, the correct and race-free way of performing the callback registration is: cpu_notifier_register_begin(); for_each_online_cpu(cpu) init_cpu(cpu); /* Note the use of the double underscored version of the API */ __register_cpu_notifier(&foobar_cpu_notifier); cpu_notifier_register_done(); Fix the profile code by using this latter form of callback registration. Cc: Al Viro Cc: Mauro Carvalho Chehab Cc: Ingo Molnar Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. 
Wysocki --- kernel/profile.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index ebdd9c1a86b4..93b2a3fe0a64 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -591,18 +591,28 @@ out_cleanup: int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ { struct proc_dir_entry *entry; + int err = 0; if (!prof_on) return 0; - if (create_hash_tables()) - return -ENOMEM; + + cpu_notifier_register_begin(); + + if (create_hash_tables()) { + err = -ENOMEM; + goto out; + } + entry = proc_create("profile", S_IWUSR | S_IRUGO, NULL, &proc_profile_operations); if (!entry) - return 0; + goto out; proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t)); - hotcpu_notifier(profile_cpu_callback, 0); - return 0; + __hotcpu_notifier(profile_cpu_callback, 0); + +out: + cpu_notifier_register_done(); + return err; } module_init(create_proc_profile); #endif /* CONFIG_PROC_FS */ -- cgit v1.3-14-g43fede From 3f1c82502c299da08b7b7f08b435212e51166ed9 Mon Sep 17 00:00:00 2001 From: William Roberts Date: Tue, 11 Feb 2014 10:12:01 -0800 Subject: audit: Audit proc/<pid>/cmdline aka proctitle During an audit event, cache and print the value of the process's proctitle value (proc/<pid>/cmdline). This is useful in situations where processes are started via fork'd virtual machines where the comm field is incorrect. Oftentimes, setting the comm field is still insufficient as the comm width is not very wide and most virtual machine "package names" do not fit. Also, during execution, many threads have their comm field set as well. By tying it back to the global cmdline value for the process, audit records will be more complete in systems with these properties. An example of where this is useful and applicable is in the realm of Android. With Android, there is no fork/exec for VM instances. The bare, preloaded Dalvik VM listens for a fork and specialize request. When this request comes in, the VM forks and then loads the specific application (specializing). This was done to take advantage of COW and to not require a load of basic packages by the VM on every app spawn. When this spawn occurs, the package name is set via setproctitle() and shows up in procfs. Many of these package names are longer than 16 bytes, the historical width of task->comm. Having the cmdline in the audit records will couple the application back to the record directly. Also, on my Debian development box, some audit records were more useful than what was printed under comm. The cached proctitle is tied to the life-cycle of the audit_context structure and is built on demand. Proctitle is controllable by userspace, and thus should not be trusted. It is meant as an aid to assist in debugging. The proctitle event is emitted during syscall audits, and can be filtered with auditctl.
Example: type=AVC msg=audit(1391217013.924:386): avc: denied { getattr } for pid=1971 comm="mkdir" name="/" dev="selinuxfs" ino=1 scontext=system_u:system_r:consolekit_t:s0-s0:c0.c255 tcontext=system_u:object_r:security_t:s0 tclass=filesystem type=SYSCALL msg=audit(1391217013.924:386): arch=c000003e syscall=137 success=yes exit=0 a0=7f019dfc8bd7 a1=7fffa6aed2c0 a2=fffffffffff4bd25 a3=7fffa6aed050 items=0 ppid=1967 pid=1971 auid=4294967295 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=4294967295 comm="mkdir" exe="/bin/mkdir" subj=system_u:system_r:consolekit_t:s0-s0:c0.c255 key=(null) type=UNKNOWN[1327] msg=audit(1391217013.924:386): proctitle=6D6B646972002D70002F7661722F72756E2F636F6E736F6C65 Acked-by: Steve Grubb (wrt record formatting) Signed-off-by: William Roberts Signed-off-by: Eric Paris --- include/uapi/linux/audit.h | 1 + kernel/audit.h | 6 +++++ kernel/auditsc.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h index 2d48fe1274ca..4315ee99b967 100644 --- a/include/uapi/linux/audit.h +++ b/include/uapi/linux/audit.h @@ -109,6 +109,7 @@ #define AUDIT_NETFILTER_PKT 1324 /* Packets traversing netfilter chains */ #define AUDIT_NETFILTER_CFG 1325 /* Netfilter chain modifications */ #define AUDIT_SECCOMP 1326 /* Secure Computing event */ +#define AUDIT_PROCTITLE 1327 /* Proctitle emit event */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/kernel/audit.h b/kernel/audit.h index 57cc64d67718..38c967d28de5 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -106,6 +106,11 @@ struct audit_names { bool should_free; }; +struct audit_proctitle { + int len; /* length of the cmdline field. */ + char *value; /* the cmdline field */ +}; + /* The per-task audit context.
*/ struct audit_context { int dummy; /* must be the first element */ @@ -202,6 +207,7 @@ struct audit_context { } execve; }; int fds[2]; + struct audit_proctitle proctitle; #if AUDIT_DEBUG int put_count; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6874c1fd453d..043d1ef9362f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -70,6 +70,7 @@ #include #include #include +#include #include "audit.h" @@ -81,6 +82,9 @@ /* no execve audit message should be longer than this (userspace limits) */ #define MAX_EXECVE_AUDIT_LEN 7500 +/* max length to print of cmdline/proctitle value during audit */ +#define MAX_PROCTITLE_AUDIT_LEN 128 + /* number of audit rules */ int audit_n_rules; @@ -844,6 +848,13 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, return context; } +static inline void audit_proctitle_free(struct audit_context *context) +{ + kfree(context->proctitle.value); + context->proctitle.value = NULL; + context->proctitle.len = 0; +} + static inline void audit_free_names(struct audit_context *context) { struct audit_names *n, *next; @@ -956,6 +967,7 @@ static inline void audit_free_context(struct audit_context *context) audit_free_aux(context); kfree(context->filterkey); kfree(context->sockaddr); + audit_proctitle_free(context); kfree(context); } @@ -1272,6 +1284,59 @@ static void show_special(struct audit_context *context, int *call_panic) audit_log_end(ab); } +static inline int audit_proctitle_rtrim(char *proctitle, int len) +{ + char *end = proctitle + len - 1; + while (end > proctitle && !isprint(*end)) + end--; + + /* catch the case where proctitle is only 1 non-print character */ + len = end - proctitle + 1; + len -= isprint(proctitle[len-1]) == 0; + return len; +} + +static void audit_log_proctitle(struct task_struct *tsk, + struct audit_context *context) +{ + int res; + char *buf; + char *msg = "(null)"; + int len = strlen(msg); + struct audit_buffer *ab; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); + if (!ab) + return; /* audit_panic or being filtered */ + + audit_log_format(ab, "proctitle="); + + /* Not cached */ + if (!context->proctitle.value) { + buf = kmalloc(MAX_PROCTITLE_AUDIT_LEN, GFP_KERNEL); + if (!buf) + goto out; + /* Historically called this from procfs naming */ + res = get_cmdline(tsk, buf, MAX_PROCTITLE_AUDIT_LEN); + if (res == 0) { + kfree(buf); + goto out; + } + res = audit_proctitle_rtrim(buf, res); + if (res == 0) { + kfree(buf); + goto out; + } + context->proctitle.value = buf; + context->proctitle.len = res; + } + msg = context->proctitle.value; + len = context->proctitle.len; +out: + audit_log_n_untrustedstring(ab, msg, len); + audit_log_end(ab); +} + static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { int i, call_panic = 0; @@ -1389,6 +1454,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_name(context, n, NULL, i++, &call_panic); } + audit_log_proctitle(tsk, context); + /* Send end of event record to help user space know we are finished */ ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); if (ab) -- cgit v1.3-14-g43fede From 638a0fd2a062568c568661be0a780be8e8836d03 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2014 10:49:05 -0800 Subject: audit: Use struct net not pid_t to remember the network namespace to reply in While reading through 3.14-rc1 I found a pretty significant mishandling of network namespaces in the recent audit changes.
In struct audit_netlink_list and audit_reply add a reference to the network namespace of the caller and remove the userspace pid of the caller. This cleanly remembers the caller's network namespace, and removes a huge class of races and nasty failure modes that can occur when attempting to relook up the caller's network namespace from a pid_t (including the caller's network namespace changing, pid wraparound, and the pid simply not being present). Signed-off-by: "Eric W. Biederman" Acked-by: Richard Guy Briggs Signed-off-by: Eric Paris --- kernel/audit.c | 10 ++++++---- kernel/audit.h | 2 +- kernel/auditfilter.c | 3 ++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 34c5a2310fbf..71fb41f393d7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -182,7 +182,7 @@ struct audit_buffer { struct audit_reply { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff *skb; }; @@ -500,7 +500,7 @@ int audit_send_list(void *_dest) { struct audit_netlink_list *dest = _dest; struct sk_buff *skb; - struct net *net = get_net_ns_by_pid(dest->pid); + struct net *net = dest->net; struct audit_net *aunet = net_generic(net, audit_net_id); /* wait for parent to finish and send an ACK */ @@ -510,6 +510,7 @@ int audit_send_list(void *_dest) while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(aunet->nlsk, skb, dest->portid, 0); + put_net(net); kfree(dest); return 0; @@ -543,7 +544,7 @@ out_kfree_skb: static int audit_send_reply_thread(void *arg) { struct audit_reply *reply = (struct audit_reply *)arg; - struct net *net = get_net_ns_by_pid(reply->pid); + struct net *net = reply->net; struct audit_net *aunet = net_generic(net, audit_net_id); mutex_lock(&audit_cmd_mutex); @@ -552,6 +553,7 @@ static int audit_send_reply_thread(void *arg) /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0); + put_net(net); kfree(reply); return 0; } @@ -583,8 +585,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; + reply->net = get_net(current->nsproxy->net_ns); reply->portid = portid; - reply->pid = task_pid_vnr(current); reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); diff --git a/kernel/audit.h b/kernel/audit.h index 38c967d28de5..7bb65730c890 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -253,7 +253,7 @@ extern void audit_panic(const char *message); struct audit_netlink_list { __u32 portid; - pid_t pid; + struct net *net; struct sk_buff_head q; }; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 3152d1aea164..a0d470131fd0 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1085,8 +1086,8 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; + dest->net = get_net(current->nsproxy->net_ns); dest->portid = portid; - dest->pid = task_pid_vnr(current); skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); -- cgit v1.3-14-g43fede From 099dd235113700bbb476e572cd191ddb77b9af46 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 28 Feb 2014 20:36:55 -0800 Subject: audit: Send replies in the proper network namespace.
In perverse cases of file descriptor passing, the current network namespace of a process and the network namespace of a socket used by that process may differ. Therefore use the network namespace of the appropriate socket to ensure replies always go to the appropriate socket. Signed-off-by: "Eric W. Biederman" Acked-by: Richard Guy Briggs Signed-off-by: Eric Paris --- include/linux/audit.h | 3 ++- kernel/audit.c | 21 ++++++++++----------- kernel/auditfilter.c | 7 +++++-- 3 files changed, 17 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index aa865a9a4c4f..ec1464df4c60 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -43,6 +43,7 @@ struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; +struct sk_buff; struct audit_krule { int vers_ops; @@ -463,7 +464,7 @@ extern int audit_filter_user(int type); extern int audit_filter_type(int type); extern int audit_rule_change(int type, __u32 portid, int seq, void *data, size_t datasz); -extern int audit_list_rules_send(__u32 portid, int seq); +extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern u32 audit_enabled; #else /* CONFIG_AUDIT */ diff --git a/kernel/audit.c b/kernel/audit.c index 71fb41f393d7..7b44bd47759c 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -570,9 +570,11 @@ static int audit_send_reply_thread(void *arg) * Allocates an skb, builds the netlink message, and sends it to the port id. * No failure notifications. */ -static void audit_send_reply(__u32 portid, int seq, int type, int done, +static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; struct audit_reply *reply = kmalloc(sizeof(struct audit_reply), @@ -585,7 +587,7 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done, if (!skb) goto out; - reply->net = get_net(current->nsproxy->net_ns); + reply->net = get_net(net); reply->portid = portid; reply->skb = skb; @@ -675,8 +677,7 @@ static int audit_get_feature(struct sk_buff *skb) seq = nlmsg_hdr(skb)->nlmsg_seq; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &af, sizeof(af)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); return 0; } @@ -796,8 +797,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.backlog = skb_queue_len(&audit_skb_queue); s.version = AUDIT_VERSION_LATEST; s.backlog_wait_time = audit_backlog_wait_time; - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, - &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_SET: { @@ -907,7 +907,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: - err = audit_list_rules_send(NETLINK_CB(skb).portid, seq); + err = audit_list_rules_send(skb, seq); break; case AUDIT_TRIM: audit_trim_trees(); @@ -972,8 +972,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memcpy(sig_data->ctx, ctx, len); security_release_secctx(ctx, len); } - audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, - 0, 0, sig_data, sizeof(*sig_data) + len); + audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0, + sig_data, sizeof(*sig_data) + len); kfree(sig_data); break; case AUDIT_TTY_GET: { @@ -985,8 +985,7 @@ static
int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) s.log_passwd = tsk->signal->audit_tty_log_passwd; spin_unlock(&tsk->sighand->siglock); - audit_send_reply(NETLINK_CB(skb).portid, seq, - AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); + audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a0d470131fd0..549bbb6e6597 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "audit.h" /* @@ -1071,8 +1072,10 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data, * @portid: target portid for netlink audit messages * @seq: netlink audit message sequence (serial) number */ -int audit_list_rules_send(__u32 portid, int seq) +int audit_list_rules_send(struct sk_buff *request_skb, int seq) { + u32 portid = NETLINK_CB(request_skb).portid; + struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct task_struct *tsk; struct audit_netlink_list *dest; int err = 0; @@ -1086,7 +1089,7 @@ int audit_list_rules_send(__u32 portid, int seq) dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest->net = get_net(current->nsproxy->net_ns); + dest->net = get_net(net); dest->portid = portid; skb_queue_head_init(&dest->q); -- cgit v1.3-14-g43fede From 4a3eb726d1543c4b616b9a0a4d4c53ddd276f5f4 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 18 Feb 2014 15:29:43 -0500 Subject: audit: rename the misleading audit_get_context() to audit_take_context() "get" usually implies incrementing a refcount into a structure to indicate a reference being held by another part of code. Change this function's name to indicate that the context is in fact being taken from the task, returning the value while clearing it in the supplying structure. Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 043d1ef9362f..57bf178ca7d5 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -811,7 +811,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) rcu_read_unlock(); } -static inline struct audit_context *audit_get_context(struct task_struct *tsk, +/* Transfer the audit context pointer to the caller, clearing it in the tsk's struct */ +static inline struct audit_context *audit_take_context(struct task_struct *tsk, int return_valid, long return_code) { @@ -1474,7 +1475,7 @@ void __audit_free(struct task_struct *tsk) { struct audit_context *context; - context = audit_get_context(tsk, 0, 0); + context = audit_take_context(tsk, 0, 0); if (!context) return; @@ -1568,7 +1569,7 @@ void __audit_syscall_exit(int success, long return_code) else success = AUDITSC_FAILURE; - context = audit_get_context(tsk, success, return_code); + context = audit_take_context(tsk, success, return_code); if (!context) return; -- cgit v1.3-14-g43fede From c92cdeb45eea38515e82187f48c2e4f435fb4e25 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 10 Dec 2013 22:10:41 -0500 Subject: audit: convert PPIDs to the initial PID namespace. sys_getppid() returns the parent pid of the current process in its own pid namespace. Since audit filters are based in the init pid namespace, a process could avoid a filter or trigger an unintended one by being in an alternate pid namespace, or it could log meaningless information.
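To make the namespace handling in the two reply patches above concrete, here is a kernel-style sketch (not buildable outside a kernel tree) of the pattern they converge on: derive the destination namespace from the socket the request arrived on, and hold a counted reference while the reply is in flight. struct reply_work and the two example_* functions are hypothetical names, not kernel symbols.

/* Sketch: reply in the namespace of the request socket, never one
 * looked up from the sender's pid. */
struct reply_work {
        struct net *net;        /* counted reference, dropped after send */
        u32 portid;
};

static int example_prepare_reply(struct sk_buff *request_skb,
                                 struct reply_work *w)
{
        struct net *net = sock_net(NETLINK_CB(request_skb).sk);

        w->net = get_net(net);  /* pin the namespace for the reply thread */
        w->portid = NETLINK_CB(request_skb).portid;
        return 0;
}

static void example_finish_reply(struct reply_work *w)
{
        /* ... netlink_unicast() into w->net's audit socket goes here ... */
        put_net(w->net);        /* drop the reference exactly once */
}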
Switch to task_ppid_nr() for PPIDs to anchor all audit filters in the init_pid_ns. (informed by ebiederman's 6c621b7e) Cc: stable@vger.kernel.org Cc: Eric W. Biederman Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 4 ++-- kernel/auditsc.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 7b44bd47759c..e1e1b2137048 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1819,10 +1819,10 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) spin_unlock_irq(&tsk->sighand->siglock); audit_log_format(ab, - " ppid=%ld pid=%d auid=%u uid=%u gid=%u" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", - sys_getppid(), + task_ppid_nr(tsk), tsk->pid, from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 57bf178ca7d5..a6cf7ab56e61 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -465,7 +465,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PPID: if (ctx) { if (!ctx->ppid) - ctx->ppid = sys_getppid(); + ctx->ppid = task_ppid_nr(tsk); result = audit_comparator(ctx->ppid, f->op, f->val); } break; -- cgit v1.3-14-g43fede From f1dc4867ff41b7bcca57fa19449d1fe7ad517ac1 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Wed, 11 Dec 2013 13:52:26 -0500 Subject: audit: anchor all pid references in the initial pid namespace Store and log all PIDs with reference to the initial PID namespace and use the access functions task_pid_nr() and task_tgid_nr() for task->pid and task->tgid. Cc: "Eric W. Biederman" (informed by ebiederman's c776b5d2) Signed-off-by: Richard Guy Briggs --- drivers/tty/tty_audit.c | 3 ++- kernel/audit.c | 5 +++-- kernel/auditfilter.c | 17 ++++++++++++++++- kernel/auditsc.c | 16 +++++++++------- security/integrity/integrity_audit.c | 2 +- security/lsm_audit.c | 11 +++++++---- 6 files changed, 38 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/drivers/tty/tty_audit.c b/drivers/tty/tty_audit.c index b0e540137e39..90ca082935f6 100644 --- a/drivers/tty/tty_audit.c +++ b/drivers/tty/tty_audit.c @@ -65,6 +65,7 @@ static void tty_audit_log(const char *description, int major, int minor, { struct audit_buffer *ab; struct task_struct *tsk = current; + pid_t pid = task_pid_nr(tsk); uid_t uid = from_kuid(&init_user_ns, task_uid(tsk)); uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(tsk)); unsigned int sessionid = audit_get_sessionid(tsk); @@ -74,7 +75,7 @@ static void tty_audit_log(const char *description, int major, int minor, char name[sizeof(tsk->comm)]; audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u major=%d" - " minor=%d comm=", description, tsk->pid, uid, + " minor=%d comm=", description, pid, uid, loginuid, sessionid, major, minor); get_task_comm(name, tsk); audit_log_untrustedstring(ab, name); diff --git a/kernel/audit.c b/kernel/audit.c index e1e1b2137048..5a096f8e28cb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -649,6 +649,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) { int rc = 0; uid_t uid = from_kuid(&init_user_ns, current_uid()); + pid_t pid = task_tgid_nr(current); if (!audit_enabled && msg_type != AUDIT_USER_AVC) { *ab = NULL; @@ -658,7 +659,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type) *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (unlikely(!*ab)) return rc; - 
audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid); + audit_log_format(*ab, "pid=%d uid=%u", pid, uid); audit_log_session_info(*ab); audit_log_task_context(*ab); @@ -1823,7 +1824,7 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", task_ppid_nr(tsk), - tsk->pid, + task_pid_nr(tsk), from_kuid(&init_user_ns, audit_get_loginuid(tsk)), from_kuid(&init_user_ns, cred->uid), from_kgid(&init_user_ns, cred->gid), diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 549bbb6e6597..96c8a704f130 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -433,6 +433,19 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->val = 0; } + if ((f->type == AUDIT_PID) || (f->type == AUDIT_PPID)) { + struct pid *pid; + rcu_read_lock(); + pid = find_vpid(f->val); + if (!pid) { + rcu_read_unlock(); + err = -ESRCH; + goto exit_free; + } + f->val = pid_nr(pid); + rcu_read_unlock(); + } + err = audit_field_valid(entry, f); if (err) goto exit_free; @@ -1242,12 +1255,14 @@ static int audit_filter_user_rules(struct audit_krule *rule, int type, for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; + pid_t pid; int result = 0; u32 sid; switch (f->type) { case AUDIT_PID: - result = audit_comparator(task_pid_vnr(current), f->op, f->val); + pid = task_pid_nr(current); + result = audit_comparator(pid, f->op, f->val); break; case AUDIT_UID: result = audit_uid_comparator(current_uid(), f->op, f->uid); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a6cf7ab56e61..6381f25ac3d4 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -457,10 +457,12 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_field *f = &rule->fields[i]; struct audit_names *n; int result = 0; + pid_t pid; switch (f->type) { case AUDIT_PID: - result = audit_comparator(tsk->pid, f->op, f->val); + pid = task_pid_nr(tsk); + result = audit_comparator(pid, f->op, f->val); break; case AUDIT_PPID: if (ctx) { @@ -2051,7 +2053,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, audit_log_format(ab, "pid=%d uid=%u" " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" " res=%d", - current->pid, uid, + task_pid_nr(current), uid, oldloginuid, loginuid, oldsessionid, sessionid, !rc); audit_log_end(ab); @@ -2275,7 +2277,7 @@ void __audit_ptrace(struct task_struct *t) { struct audit_context *context = current->audit_context; - context->target_pid = t->pid; + context->target_pid = task_pid_nr(t); context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); @@ -2300,7 +2302,7 @@ int __audit_signal_info(int sig, struct task_struct *t) if (audit_pid && t->tgid == audit_pid) { if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { - audit_sig_pid = tsk->pid; + audit_sig_pid = task_pid_nr(tsk); if (uid_valid(tsk->loginuid)) audit_sig_uid = tsk->loginuid; else @@ -2314,7 +2316,7 @@ int __audit_signal_info(int sig, struct task_struct *t) /* optimize the common case by putting first signal recipient directly * in audit_context */ if (!ctx->target_pid) { - ctx->target_pid = t->tgid; + ctx->target_pid = task_tgid_nr(t); ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); @@ -2335,7 +2337,7 @@ int __audit_signal_info(int sig, struct task_struct *t) } BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); - 
axp->target_pid[axp->pid_count] = t->tgid; + axp->target_pid[axp->pid_count] = task_tgid_nr(t); axp->target_auid[axp->pid_count] = audit_get_loginuid(t); axp->target_uid[axp->pid_count] = t_uid; axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); @@ -2435,7 +2437,7 @@ static void audit_log_task(struct audit_buffer *ab) from_kgid(&init_user_ns, gid), sessionid); audit_log_task_context(ab); - audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); audit_log_untrustedstring(ab, current->comm); if (mm) { down_read(&mm->mmap_sem); diff --git a/security/integrity/integrity_audit.c b/security/integrity/integrity_audit.c index d7efb30404aa..85253b584791 100644 --- a/security/integrity/integrity_audit.c +++ b/security/integrity/integrity_audit.c @@ -39,7 +39,7 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, ab = audit_log_start(current->audit_context, GFP_KERNEL, audit_msgno); audit_log_format(ab, "pid=%d uid=%u auid=%u ses=%u", - current->pid, + task_pid_nr(current), from_kuid(&init_user_ns, current_cred()->uid), from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 9a62045e6282..69fdf3bc765b 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -220,7 +220,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); - audit_log_format(ab, " pid=%d comm=", tsk->pid); + audit_log_format(ab, " pid=%d comm=", task_pid_nr(tsk)); audit_log_untrustedstring(ab, tsk->comm); switch (a->type) { @@ -278,9 +278,12 @@ static void dump_common_audit_data(struct audit_buffer *ab, } case LSM_AUDIT_DATA_TASK: tsk = a->u.tsk; - if (tsk && tsk->pid) { - audit_log_format(ab, " pid=%d comm=", tsk->pid); - audit_log_untrustedstring(ab, tsk->comm); + if (tsk) { + pid_t pid = task_pid_nr(tsk); + if (pid) { + audit_log_format(ab, " pid=%d comm=", pid); + audit_log_untrustedstring(ab, tsk->comm); + } } break; case LSM_AUDIT_DATA_NET: -- cgit v1.3-14-g43fede From 5a3cb3b6c3a07904bb66baf055b2eaf01198b1f9 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Fri, 16 Aug 2013 00:04:46 -0400 Subject: audit: allow user processes to log from another PID namespace Still only permit the audit logging daemon and control to operate from the initial PID namespace, but allow processes to log from another PID namespace. Cc: "Eric W. Biederman" (informed by ebiederman's c776b5d2) Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5a096f8e28cb..72c6e1cd6ef5 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -607,9 +607,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) { int err = 0; - /* Only support the initial namespaces for now. */ - if ((current_user_ns() != &init_user_ns) || - (task_active_pid_ns(current) != &init_pid_ns)) + /* Only support initial user namespace for now. */ + if ((current_user_ns() != &init_user_ns)) return -EPERM; switch (msg_type) { @@ -629,6 +628,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) case AUDIT_TTY_SET: case AUDIT_TRIM: case AUDIT_MAKE_EQUIV: + /* Only support auditd and auditctl in initial pid namespace + * for now. 
*/ + if ((task_active_pid_ns(current) != &init_pid_ns)) + return -EPERM; + if (!capable(CAP_AUDIT_CONTROL)) err = -EPERM; break; -- cgit v1.3-14-g43fede From aa589a13b5d00d3c643ee4114d8cbc3addb4e99f Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Mon, 24 Feb 2014 12:31:11 -0500 Subject: audit: remove superfluous new- prefix in AUDIT_LOGIN messages The new- prefix on ses and auid is unnecessary and breaks ausearch. Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 6381f25ac3d4..61ac3cf53f1d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2051,7 +2051,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, if (!ab) return; audit_log_format(ab, "pid=%d uid=%u" - " old-auid=%u new-auid=%u old-ses=%u new-ses=%u" + " old-auid=%u auid=%u old-ses=%u ses=%u" " res=%d", task_pid_nr(current), uid, oldloginuid, loginuid, oldsessionid, sessionid, -- cgit v1.3-14-g43fede From ddfad8affdb73cc8df5890fef16d98d63ff3a6f0 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 19 Jan 2011 19:22:35 -0500 Subject: audit: include subject in login records The login uid change record does not include the SELinux context of the task logging in. Add that information. (Updated from 2011-01: RHBZ:670328 -- RGB) Reported-by: Steve Grubb Acked-by: James Morris Signed-off-by: Eric Paris Signed-off-by: Aristeu Rozanski Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 61ac3cf53f1d..bd3de52600ff 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2050,12 +2050,10 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid, ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (!ab) return; - audit_log_format(ab, "pid=%d uid=%u" - " old-auid=%u auid=%u old-ses=%u ses=%u" - " res=%d", - task_pid_nr(current), uid, - oldloginuid, loginuid, oldsessionid, sessionid, - !rc); + audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid); + audit_log_task_context(ab); + audit_log_format(ab, " old-auid=%u auid=%u old-ses=%u ses=%u res=%d", + oldloginuid, loginuid, oldsessionid, sessionid, !rc); audit_log_end(ab); } -- cgit v1.3-14-g43fede From f12835276c3182f2b998d93dfd60100cf4b60c05 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 5 Mar 2014 16:29:55 -0500 Subject: audit: remove stray newlines from audit_log_lost messages Calling audit_log_lost with a \n in the format string leads to extra newlines in dmesg. That function will eventually call audit_panic which uses pr_err with an explicit \n included. Just make these calls match the others that lack \n.
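The pid-namespace anchoring above rests on one translation idiom worth spelling out: a PID arriving from userspace in a filter rule is a vpid in the sender's namespace, and audit_data_to_entry() converts it to the init namespace before storing it, since filtering later compares init-namespace values. Here is a kernel-style sketch of that idiom, mirroring the auditfilter.c hunk above (the helper name is hypothetical):

static int vpid_to_init_ns(pid_t vpid, pid_t *out)
{
        struct pid *pid;

        rcu_read_lock();
        pid = find_vpid(vpid);          /* look up in the caller's pid ns */
        if (!pid) {
                rcu_read_unlock();
                return -ESRCH;          /* not visible to the caller */
        }
        *out = pid_nr(pid);             /* global (init-ns) pid number */
        rcu_read_unlock();
        return 0;
}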
Reported-by: Jonathan Kamens Signed-off-by: Josh Boyer Signed-off-by: Richard Guy Briggs --- kernel/audit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 72c6e1cd6ef5..c0696dcfed11 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -396,7 +396,7 @@ static void audit_printk_skb(struct sk_buff *skb) if (printk_ratelimit()) pr_notice("type=%d %s\n", nlh->nlmsg_type, data); else - audit_log_lost("printk limit exceeded\n"); + audit_log_lost("printk limit exceeded"); } audit_hold_skb(skb); @@ -412,7 +412,7 @@ static void kauditd_send_skb(struct sk_buff *skb) BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ if (audit_pid) { pr_err("*NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd disappeared\n"); + audit_log_lost("auditd disappeared"); audit_pid = 0; audit_sock = NULL; } -- cgit v1.3-14-g43fede From b7550787fe8b5beffb5f56fa11a87712d699d085 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 5 Mar 2014 14:34:36 -0800 Subject: audit: remove stray newline from audit_log_execve_info() audit_panic() call There's an unnecessary use of a \n in audit_panic. Signed-off-by: Richard Guy Briggs --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index bd3de52600ff..254ce2063d1d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1173,7 +1173,7 @@ static void audit_log_execve_info(struct audit_context *context, */ buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); if (!buf) { - audit_panic("out of memory for argv string\n"); + audit_panic("out of memory for argv string"); return; } -- cgit v1.3-14-g43fede From 5e937a9ae9137899c6641d718bd3820861099a09 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 11 Mar 2014 12:48:43 -0400 Subject: syscall_get_arch: remove useless function arguments Every caller of syscall_get_arch() uses current for the task and no implementors of the function need args. So just get rid of both of those things. Admittedly, since these are inline functions we aren't wasting stack space, but it just makes the prototypes better. Signed-off-by: Eric Paris Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mips@linux-mips.org Cc: linux390@de.ibm.com Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-s390@vger.kernel.org Cc: linux-arch@vger.kernel.org --- arch/arm/include/asm/syscall.h | 3 +-- arch/mips/include/asm/syscall.h | 2 +- arch/mips/kernel/ptrace.c | 2 +- arch/s390/include/asm/syscall.h | 5 ++--- arch/x86/include/asm/syscall.h | 8 +++----- include/asm-generic/syscall.h | 4 +--- kernel/seccomp.c | 4 ++-- 7 files changed, 11 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index 73ddd7239b33..ed805f1d3785 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -103,8 +103,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->ARM_r0 + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { /* ARM tasks don't change audit architectures on the fly. 
*/ return AUDIT_ARCH_ARM; diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 81c89132c59d..625e709e81b9 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -101,7 +101,7 @@ extern const unsigned long sys_call_table[]; extern const unsigned long sys32_call_table[]; extern const unsigned long sysn32_call_table[]; -static inline int __syscall_get_arch(void) +static inline int syscall_get_arch(void) { int arch = EM_MIPS; #ifdef CONFIG_64BIT diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index b52e1d2b33e0..65ba622baf3e 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -671,7 +671,7 @@ asmlinkage void syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_enter(regs, regs->regs[2]); - audit_syscall_entry(__syscall_get_arch(), + audit_syscall_entry(syscall_get_arch(), regs->regs[2], regs->regs[4], regs->regs[5], regs->regs[6], regs->regs[7]); diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index cd29d2f4e4f3..bebc0bd8abc2 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -89,11 +89,10 @@ static inline void syscall_set_arguments(struct task_struct *task, regs->orig_gpr2 = args[0]; } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { #ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) + if (test_tsk_thread_flag(current, TIF_31BIT)) return AUDIT_ARCH_S390; #endif return sizeof(long) == 8 ? AUDIT_ARCH_S390X : AUDIT_ARCH_S390; diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index aea284b41312..7e6d0c49ecab 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -91,8 +91,7 @@ static inline void syscall_set_arguments(struct task_struct *task, memcpy(®s->bx + i, args, n * sizeof(args[0])); } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { return AUDIT_ARCH_I386; } @@ -221,8 +220,7 @@ static inline void syscall_set_arguments(struct task_struct *task, } } -static inline int syscall_get_arch(struct task_struct *task, - struct pt_regs *regs) +static inline int syscall_get_arch(void) { #ifdef CONFIG_IA32_EMULATION /* @@ -234,7 +232,7 @@ static inline int syscall_get_arch(struct task_struct *task, * * x32 tasks should be considered AUDIT_ARCH_X86_64. */ - if (task_thread_info(task)->status & TS_COMPAT) + if (task_thread_info(current)->status & TS_COMPAT) return AUDIT_ARCH_I386; #endif /* Both x32 and x86_64 are considered "64-bit". */ diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 5b09392db673..d401e5463fb0 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -144,8 +144,6 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, /** * syscall_get_arch - return the AUDIT_ARCH for the current system call - * @task: task of interest, must be in system call entry tracing - * @regs: task_pt_regs() of @task * * Returns the AUDIT_ARCH_* based on the system call convention in use. * @@ -155,5 +153,5 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. 
*/ -int syscall_get_arch(struct task_struct *task, struct pt_regs *regs); +int syscall_get_arch(void); #endif /* _ASM_SYSCALL_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..eda2da3df822 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -95,7 +95,7 @@ u32 seccomp_bpf_load(int off) if (off == BPF_DATA(nr)) return syscall_get_nr(current, regs); if (off == BPF_DATA(arch)) - return syscall_get_arch(current, regs); + return syscall_get_arch(); if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { unsigned long value; int arg = (off - BPF_DATA(args[0])) / sizeof(u64); @@ -351,7 +351,7 @@ static void seccomp_send_sigsys(int syscall, int reason) info.si_code = SYS_SECCOMP; info.si_call_addr = (void __user *)KSTK_EIP(current); info.si_errno = reason; - info.si_arch = syscall_get_arch(current, task_pt_regs(current)); + info.si_arch = syscall_get_arch(); info.si_syscall = syscall; force_sig_info(SIGSYS, &info, current); } -- cgit v1.3-14-g43fede From e1b2dc176f2d5be7952c47a4e4e8f3b06a90db1c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Mar 2014 11:10:15 -0400 Subject: cgroup: break kernfs active_ref protection in cgroup directory operations cgroup_tree_mutex should nest above the kernfs active_ref protection; however, cgroup_create() and cgroup_rename() were grabbing cgroup_tree_mutex while under kernfs active_ref protection. This can actually lead to deadlocks if these operations race against cgroup_rmdir(), which invokes kernfs_remove() on the directory kernfs_node while holding cgroup_tree_mutex. Neither cgroup_create() nor cgroup_rename() requires active_ref protection. The former already has enough synchronization through cgroup_lock_live_group() and the latter doesn't care, so this can be fixed by updating both functions to break all active_ref protections before grabbing cgroup_tree_mutex. While this patch fixes the immediate issue, it probably needs further work in the long term - kernfs directories should enable lockdep annotations and maybe the better way to handle this is marking directory nodes as not needing active_ref protection rather than breaking it in each operation. Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman --- kernel/cgroup.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 98a8045e2149..58c67b3060b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2324,6 +2324,14 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, if (cgroup_sane_behavior(cgrp)) return -EPERM; + /* + * We're gonna grab cgroup_tree_mutex which nests outside kernfs + * active_ref. kernfs_rename() doesn't require active_ref + * protection. Break them before grabbing cgroup_tree_mutex.
+ */ + kernfs_break_active_protection(new_parent); + kernfs_break_active_protection(kn); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); @@ -2331,6 +2339,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_unlock(&cgroup_mutex); mutex_unlock(&cgroup_tree_mutex); + + kernfs_unbreak_active_protection(kn); + kernfs_unbreak_active_protection(new_parent); return ret; } @@ -3778,8 +3789,22 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { struct cgroup *parent = parent_kn->priv; + int ret; - return cgroup_create(parent, name, mode); + /* + * cgroup_create() grabs cgroup_tree_mutex which nests outside + * kernfs active_ref and cgroup_create() already synchronizes + * properly against removal through cgroup_lock_live_group(). + * Break it before calling cgroup_create(). + */ + cgroup_get(parent); + kernfs_break_active_protection(parent_kn); + + ret = cgroup_create(parent, name, mode); + + kernfs_unbreak_active_protection(parent_kn); + cgroup_put(parent); + return ret; } /* -- cgit v1.3-14-g43fede From 87291347c49dc40aa339f587b209618201c2e527 Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Thu, 13 Feb 2014 19:51:48 -0800 Subject: tracing: Fix array size mismatch in format string In event format strings, the array size is reported in two locations: once in the array subscript and again via the "size:" attribute. The values reported there mismatch. For example, in sched:sched_switch the prev_comm and next_comm character arrays have a subscript of [32] whereas the actual field size is 16.

name: sched_switch
ID: 301
format:
        field:unsigned short common_type; offset:0; size:2; signed:0;
        field:unsigned char common_flags; offset:2; size:1; signed:0;
        field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
        field:int common_pid; offset:4; size:4; signed:1;

        field:char prev_comm[32]; offset:8; size:16; signed:1;
        field:pid_t prev_pid; offset:24; size:4; signed:1;
        field:int prev_prio; offset:28; size:4; signed:1;
        field:long prev_state; offset:32; size:8; signed:1;
        field:char next_comm[32]; offset:40; size:16; signed:1;
        field:pid_t next_pid; offset:56; size:4; signed:1;
        field:int next_prio; offset:60; size:4; signed:1;

After bisection, the following commit was blamed: 92edca0 tracing: Use direct field, type and system names This commit removes the duplication of strings for field->name and field->type assuming that all the strings passed in __trace_define_field() are immutable. This is not true for arrays, where the type string is created in the event_storage variable and field->type for all array fields points to event_storage. Use __stringify() to create a string constant for the type string. Also, get rid of event_storage and event_storage_mutex that are not needed anymore.
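The __stringify() trick deserves a standalone illustration, since it is what lets the fix drop both the runtime snprintf() and the event_storage_mutex: the preprocessor pastes adjacent string literals, so "type[len]" becomes a compile-time constant. A minimal userspace demo follows; the macro names mirror the kernel's two-level idiom in <linux/stringify.h> (the real kernel version is written with variadic parameters), and ARRAY_LEN/TYPE_STR are my own illustrative names.

#include <stdio.h>

#define __stringify_1(x)        #x
#define __stringify(x)          __stringify_1(x)   /* expand, then quote */

#define ARRAY_LEN               32
#define TYPE_STR(type, len)     #type "[" __stringify(len) "]"

int main(void)
{
        /* one compile-time string constant: no shared buffer, no lock */
        const char *type_str = TYPE_STR(char, ARRAY_LEN);

        printf("%s\n", type_str);       /* prints: char[32] */
        return 0;
}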
Also, an added benefit is that this reduces the overhead of events a bit more:

   text    data     bss      dec    hex filename
8424787 2036472 1302528 11763787 b3804b vmlinux
8420814 2036408 1302528 11759750 b37086 vmlinux.patched

Link: http://lkml.kernel.org/r/1392349908-29685-1-git-send-email-vnagarnaik@google.com Cc: Laurent Chavey Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 4 ---- include/trace/ftrace.h | 7 ++----- kernel/trace/trace_events.c | 6 ------ kernel/trace/trace_export.c | 7 ++----- 4 files changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4e4cc28623ad..4cdb3a17bcb5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -495,10 +495,6 @@ enum { FILTER_TRACE_FN, }; -#define EVENT_STORAGE_SIZE 128 -extern struct mutex event_storage_mutex; -extern char event_storage[EVENT_STORAGE_SIZE]; - extern int trace_event_raw_init(struct ftrace_event_call *call); extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1a8b28db3775..1ee19a24cc5f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -310,15 +310,12 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __array #define __array(type, item, len) \ do { \ - mutex_lock(&event_storage_mutex); \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), FILTER_OTHER); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f3989ceb5cd5..7b16d40bd64d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 7c3e3e72e2b6..ee0a5098ac43 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); -- cgit v1.3-14-g43fede From 8c90487cdc64847b4fdd812ab3047f426fec4d13 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 26 Feb 2014 10:49:49 -0500 Subject: Rename TAINT_UNSAFE_SMP to TAINT_CPU_OUT_OF_SPEC Rename TAINT_UNSAFE_SMP to TAINT_CPU_OUT_OF_SPEC, so we can repurpose the flag to
encompass a wider range of pushing the CPU beyond its warranty. Signed-off-by: Dave Jones Link: http://lkml.kernel.org/r/20140226154949.GA770@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 +- include/linux/kernel.h | 2 +- kernel/module.c | 2 +- kernel/panic.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b85e43a5a462..ce8b8ff0e0ef 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -218,7 +218,7 @@ static void amd_k7_smp_check(struct cpuinfo_x86 *c) */ WARN_ONCE(1, "WARNING: This combination of AMD" " processors is not suitable for SMP.\n"); - add_taint(TAINT_UNSAFE_SMP, LOCKDEP_NOW_UNRELIABLE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_NOW_UNRELIABLE); } static void init_amd_k7(struct cpuinfo_x86 *c) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 196d1ea86df0..08fb02477641 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -458,7 +458,7 @@ extern enum system_states { #define TAINT_PROPRIETARY_MODULE 0 #define TAINT_FORCED_MODULE 1 -#define TAINT_UNSAFE_SMP 2 +#define TAINT_CPU_OUT_OF_SPEC 2 #define TAINT_FORCED_RMMOD 3 #define TAINT_MACHINE_CHECK 4 #define TAINT_BAD_PAGE 5 diff --git a/kernel/module.c b/kernel/module.c index d24fcf29cb64..ca2c1aded7ee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1015,7 +1015,7 @@ static size_t module_flags_taint(struct module *mod, char *buf) buf[l++] = 'C'; /* * TAINT_FORCED_RMMOD: could be added. - * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * TAINT_CPU_OUT_OF_SPEC, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't * apply to modules. */ return l; diff --git a/kernel/panic.c b/kernel/panic.c index 6d6300375090..2270cfd1d6be 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -199,7 +199,7 @@ struct tnt { static const struct tnt tnts[] = { { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_CPU_OUT_OF_SPEC, 'S', ' ' }, { TAINT_FORCED_RMMOD, 'R', ' ' }, { TAINT_MACHINE_CHECK, 'M', ' ' }, { TAINT_BAD_PAGE, 'B', ' ' }, -- cgit v1.3-14-g43fede From 765a3f4fed708ae429ee095914a7897acb3a65bd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Mar 2014 16:37:08 -0700 Subject: rcu: Provide grace-period piggybacking API The following pattern is currently not well supported by RCU:

1. Make data element inaccessible to RCU readers.
2. Do work that probably lasts for more than one grace period.
3. Do something to make sure RCU readers in flight before #1 above have completed.

Here are some things that could currently be done:

a. Do a synchronize_rcu() unconditionally at either #1 or #3 above. This works, but imposes needless work and latency.
b. Post an RCU callback at #1 above that does a wakeup, then wait for the wakeup at #3. This works well, but likely results in an extra unneeded grace period. Open-coding this is also a bit trickier than would be good.

This commit therefore adds get_state_synchronize_rcu() and cond_synchronize_rcu() APIs. Call get_state_synchronize_rcu() at #1 above and pass its return value to cond_synchronize_rcu() at #3 above. This results in a call to synchronize_rcu() if no grace period has elapsed between #1 and #3, but requires only a load, comparison, and memory barrier if a full grace period did elapse. Requested-by: Peter Zijlstra Signed-off-by: Paul E.
McKenney Acked-by: Peter Zijlstra --- include/linux/rcutiny.h | 10 ++++++++ include/linux/rcutree.h | 2 ++ kernel/rcu/tree.c | 62 +++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 70 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index e8cb6e3b52a7..425c659d54e5 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,6 +27,16 @@ #include +static inline unsigned long get_state_synchronize_rcu(void) +{ + return 0; +} + +static inline void cond_synchronize_rcu(unsigned long oldstate) +{ + might_sleep(); +} + static inline void rcu_barrier_bh(void) { wait_rcu_gp(call_rcu_bh); diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index e9c63884df0a..a59ca05fd4e3 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -76,6 +76,8 @@ static inline void synchronize_rcu_bh_expedited(void) void rcu_barrier(void); void rcu_barrier_bh(void); void rcu_barrier_sched(void); +unsigned long get_state_synchronize_rcu(void); +void cond_synchronize_rcu(unsigned long oldstate); extern unsigned long rcutorture_testseq; extern unsigned long rcutorture_vernum; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 351faba48b91..0c47e300210a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1421,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ record_gp_stall_check_time(rsp); - smp_wmb(); /* Record GP times before starting GP. */ - rsp->gpnum++; + /* Record GP times before starting GP, hence smp_store_release(). */ + smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); /* Exclude any concurrent CPU-hotplug operations. */ mutex_lock(&rsp->onoff_mutex); + smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -1555,10 +1556,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); - rsp->completed = rsp->gpnum; /* Declare grace period done. */ + /* Declare grace period done. */ + ACCESS_ONCE(rsp->completed) = rsp->gpnum; trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); @@ -2637,6 +2639,58 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +/** + * get_state_synchronize_rcu - Snapshot current RCU state + * + * Returns a cookie that is used by a later call to cond_synchronize_rcu() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_rcu(void) +{ + /* + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. + */ + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_rcu() + * and cond_synchronize_rcu(). + */ + return smp_load_acquire(&rcu_state->gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); + +/** + * cond_synchronize_rcu - Conditionally wait for an RCU grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_rcu() + * + * If a full RCU grace period has elapsed since the earlier call to + * get_state_synchronize_rcu(), just return. 
Otherwise, invoke + * synchronize_rcu() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_rcu(unsigned long oldstate) +{ + unsigned long newstate; + + /* + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. + */ + newstate = smp_load_acquire(&rcu_state->completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu); + static int synchronize_sched_expedited_cpu_stop(void *data) { /* -- cgit v1.3-14-g43fede From 11d4616bd07f38d496bd489ed8fad1dc4d928823 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 20 Mar 2014 22:11:17 -0700 Subject: futex: revert back to the explicit waiter counting code Srikar Dronamraju reports that commit b0c29f79ecea ("futexes: Avoid taking the hb->lock if there's nothing to wake up") causes Java threads to get stuck on futexes when running specjbb on a POWER7 NUMA box. The cause appears to be that the powerpc spinlocks aren't using the same ticket lock model that we use on x86 (and other) architectures, which in turn results in the "spin_is_locked()" test in hb_waiters_pending() occasionally reporting an unlocked spinlock even when there are pending waiters. So this reinstates Davidlohr Bueso's original explicit waiter counting code, which I had convinced Davidlohr to drop in favor of figuring out the pending waiters by just using the existing state of the spinlock and the wait queue. Reported-and-tested-by: Srikar Dronamraju Original-code-by: Davidlohr Bueso Signed-off-by: Linus Torvalds --- kernel/futex.c | 53 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 44a1261cb9ff..08ec814ad9d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -234,6 +234,7 @@ static const struct futex_q futex_q_init = { * waiting on a futex. */ struct futex_hash_bucket { + atomic_t waiters; spinlock_t lock; struct plist_head chain; } ____cacheline_aligned_in_smp; @@ -253,22 +254,37 @@ static inline void futex_get_mm(union futex_key *key) smp_mb__after_atomic_inc(); } -static inline bool hb_waiters_pending(struct futex_hash_bucket *hb) +/* + * Reflects a new waiter being added to the waitqueue. + */ +static inline void hb_waiters_inc(struct futex_hash_bucket *hb) { #ifdef CONFIG_SMP + atomic_inc(&hb->waiters); /* - * Tasks trying to enter the critical region are most likely - * potential waiters that will be added to the plist. Ensure - * that wakers won't miss to-be-slept tasks in the window between - * the wait call and the actual plist_add. + * Full barrier (A), see the ordering comment above. */ - if (spin_is_locked(&hb->lock)) - return true; - smp_rmb(); /* Make sure we check the lock state first */ + smp_mb__after_atomic_inc(); +#endif +} + +/* + * Reflects a waiter being removed from the waitqueue by wakeup + * paths.
+ */ +static inline void hb_waiters_dec(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + atomic_dec(&hb->waiters); +#endif +} - return !plist_head_empty(&hb->chain); +static inline int hb_waiters_pending(struct futex_hash_bucket *hb) +{ +#ifdef CONFIG_SMP + return atomic_read(&hb->waiters); #else - return true; + return 1; #endif } @@ -954,6 +970,7 @@ static void __unqueue_futex(struct futex_q *q) hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); } /* @@ -1257,7 +1274,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, */ if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); + hb_waiters_dec(hb1); plist_add(&q->list, &hb2->chain); + hb_waiters_inc(hb2); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); @@ -1600,6 +1619,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) struct futex_hash_bucket *hb; hb = hash_futex(&q->key); + + /* + * Increment the counter before taking the lock so that + * a potential waker won't miss a to-be-slept task that is + * waiting for the spinlock. This is safe as all queue_lock() + * users end up calling queue_me(). Similarly, for housekeeping, + * decrement the counter at queue_unlock() when some error has + * occurred and we don't end up adding the task to the list. + */ + hb_waiters_inc(hb); + q->lock_ptr = &hb->lock; spin_lock(&hb->lock); /* implies MB (A) */ @@ -1611,6 +1641,7 @@ queue_unlock(struct futex_hash_bucket *hb) __releases(&hb->lock) { spin_unlock(&hb->lock); + hb_waiters_dec(hb); } /** @@ -2342,6 +2373,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * Unqueue the futex_q and determine which it was. */ plist_del(&q->list, &hb->chain); + hb_waiters_dec(hb); /* Handle spurious wakeups gracefully */ ret = -EWOULDBLOCK; @@ -2875,6 +2907,7 @@ static int __init futex_init(void) futex_cmpxchg_enabled = 1; for (i = 0; i < futex_hashsize; i++) { + atomic_set(&futex_queues[i].waiters, 0); plist_head_init(&futex_queues[i].chain); spin_lock_init(&futex_queues[i].lock); } -- cgit v1.3-14-g43fede From bc4c426ee2431d1f717004d3bbaacbd819b544fd Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 21 Mar 2014 08:23:38 -0400 Subject: Revert "tracing: Move event storage for array from macro to standalone function" I originally wrote commit 35bb4399bd0e to shrink the size of the overhead of tracepoints by several kilobytes. Later, I received a patch from Vaibhav Nagarnaik that fixed a bug in the same code that this commit touches. Not only did it fix a bug, it also removed code and shrunk the size of the overhead of trace events even more than this commit did. Since this commit is scheduled for 3.15 and Vaibhav's patch is already in mainline, I need to revert this patch in order to keep it from conflicting with Vaibhav's patch. Not to mention, Vaibhav's patch makes this patch obsolete. 
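The grace-period piggybacking commit a few patches above describes its intended call pattern only in prose; the following kernel-style sketch spells it out. Everything except the two new RCU calls is a hypothetical name invented for illustration.

static void example_teardown(struct foo *p)
{
        unsigned long gp_state;

        remove_foo_from_rcu_list(p);            /* #1: hide from new readers */
        gp_state = get_state_synchronize_rcu(); /* snapshot the GP counter */

        do_long_running_teardown(p);            /* #2: likely spans a GP */

        cond_synchronize_rcu(gp_state);         /* #3: calls synchronize_rcu()
                                                 * only if no full grace period
                                                 * elapsed in the meantime */
        kfree(p);                               /* pre-#1 readers are done */
}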
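Likewise, the futex revert above is easier to follow with the waiter-accounting idea reduced to its core. This userspace C11 sketch is illustrative only: the real kernel code pairs the increment with the hash-bucket spinlock's implied full barrier, and all names here are mine, not kernel symbols.

#include <stdatomic.h>
#include <stdio.h>

struct bucket {
        atomic_int waiters;             /* explicit count, no lock probing */
        /* spinlock and wait queue elided */
};

static void waiter_enter(struct bucket *hb)
{
        /* Increment BEFORE queueing/sleeping, so a concurrent waker can
         * never observe 0 while someone is committed to waiting. */
        atomic_fetch_add(&hb->waiters, 1);      /* seq_cst: full barrier */
}

static void waiter_exit(struct bucket *hb)
{
        atomic_fetch_sub(&hb->waiters, 1);
}

static int waiters_pending(struct bucket *hb)
{
        return atomic_load(&hb->waiters) != 0;  /* cheap check for wakers */
}

int main(void)
{
        struct bucket hb = { .waiters = 0 };

        waiter_enter(&hb);
        printf("pending: %d\n", waiters_pending(&hb));  /* 1 */
        waiter_exit(&hb);
        printf("pending: %d\n", waiters_pending(&hb));  /* 0 */
        return 0;
}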
Link: http://lkml.kernel.org/r/20140320225637.0226041b@gandalf.local.home Cc: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 8 ++++++++ include/trace/ftrace.h | 12 ++++++++---- kernel/trace/trace_events.c | 6 ++++++ kernel/trace/trace_export.c | 12 ++++++++---- kernel/trace/trace_output.c | 21 --------------------- 5 files changed, 30 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 9d3fe0658398..cdc975929d15 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -221,6 +221,10 @@ void *ftrace_event_buffer_reserve(struct ftrace_event_buffer *fbuffer, void ftrace_event_buffer_commit(struct ftrace_event_buffer *fbuffer); +int ftrace_event_define_field(struct ftrace_event_call *call, + char *type, int len, char *item, int offset, + int field_size, int sign, int filter); + enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, @@ -519,6 +523,10 @@ enum { FILTER_TRACE_FN, }; +#define EVENT_STORAGE_SIZE 128 +extern struct mutex event_storage_mutex; +extern char event_storage[EVENT_STORAGE_SIZE]; + extern int trace_event_raw_init(struct ftrace_event_call *call); extern int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index e15ae4010d51..d1d91875faa5 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -302,11 +302,15 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef __array #define __array(type, item, len) \ do { \ + mutex_lock(&event_storage_mutex); \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = ftrace_event_define_field(event_call, #type, len, \ - #item, offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + snprintf(event_storage, sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ + mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b8f73b333a3c..2f7b8e31e3a4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,6 +27,12 @@ DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_storage_mutex); +EXPORT_SYMBOL_GPL(event_storage_mutex); + +char event_storage[EVENT_STORAGE_SIZE]; +EXPORT_SYMBOL_GPL(event_storage); + LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 39c746c5ae73..7c3e3e72e2b6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -96,10 +96,14 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __array(type, item, len) \ do { \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = ftrace_event_define_field(event_call, #type, len, \ - #item, offsetof(typeof(field), item), \ - sizeof(field.item), \ - is_signed_type(type), filter_type); \ + mutex_lock(&event_storage_mutex); \ + snprintf(event_storage, sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), filter_type); \ + mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); diff --git 
a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ee8d74840b88..ca0e79e2abaa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -20,10 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; -#define EVENT_STORAGE_SIZE 128 -static DEFINE_MUTEX(event_storage_mutex); -static char event_storage[EVENT_STORAGE_SIZE]; - int trace_print_seq(struct seq_file *m, struct trace_seq *s) { int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; @@ -474,23 +470,6 @@ int ftrace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) } EXPORT_SYMBOL_GPL(ftrace_output_call); -int ftrace_event_define_field(struct ftrace_event_call *call, - char *type, int len, char *item, int offset, - int field_size, int sign, int filter) -{ - int ret; - - mutex_lock(&event_storage_mutex); - snprintf(event_storage, sizeof(event_storage), - "%s[%d]", type, len); - ret = trace_define_field(call, event_storage, item, offset, - field_size, sign, filter); - mutex_unlock(&event_storage_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(ftrace_event_define_field); - #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.3-14-g43fede From 0dea6d52638b2693b18cd2ed8938b236e0789ddb Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 21 Mar 2014 01:19:01 -0400 Subject: tracepoint: Remove unused API functions After the following commit: commit b75ef8b44b1cb95f5a26484b0e2fe37a63b12b44 Author: Mathieu Desnoyers Date: Wed Aug 10 15:18:39 2011 -0400 Tracepoint: Dissociate from module mutex The following functions became unnecessary: - tracepoint_probe_register_noupdate, - tracepoint_probe_unregister_noupdate, - tracepoint_probe_update_all. In fact, none of the in-kernel tracers, nor LTTng, nor SystemTAP use them. Remove those. Moreover, the functions: - tracepoint_iter_start, - tracepoint_iter_next, - tracepoint_iter_stop, - tracepoint_iter_reset. are unused by in-kernel tracers, LTTng and SystemTAP. Remove those too. 
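For contrast with the removed variants, here is a hedged sketch of the registration API that survives, using only the declarations retained by this patch. The tracepoint name "my_subsys_event", its (int) payload, and the data-pointer-first probe convention are assumptions for illustration, not taken from the patch.

/* kernel-style sketch, not buildable outside a kernel tree */
static void my_probe(void *data, int value)
{
        /* the registered data pointer arrives first (assumed convention),
         * followed by the tracepoint's own arguments */
}

static int __init my_module_init(void)
{
        /* connects the probe immediately; no _noupdate/update_all dance */
        return tracepoint_probe_register("my_subsys_event", my_probe, NULL);
}

static void __exit my_module_exit(void)
{
        tracepoint_probe_unregister("my_subsys_event", my_probe, NULL);
        tracepoint_synchronize_unregister();    /* quiesce probes before the
                                                 * module text can go away */
}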
Link: http://lkml.kernel.org/r/1395379142-2118-2-git-send-email-mathieu.desnoyers@efficios.com Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 18 ---- kernel/tracepoint.c | 222 +-------------------------------------------- 2 files changed, 5 insertions(+), 235 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index accc497f8d72..a3b2837d8dd1 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -48,12 +48,6 @@ extern int tracepoint_probe_register(const char *name, void *probe, void *data); extern int tracepoint_probe_unregister(const char *name, void *probe, void *data); -extern int tracepoint_probe_register_noupdate(const char *name, void *probe, - void *data); -extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe, - void *data); -extern void tracepoint_probe_update_all(void); - #ifdef CONFIG_MODULES struct tp_module { struct list_head list; @@ -62,18 +56,6 @@ struct tp_module { }; #endif /* CONFIG_MODULES */ -struct tracepoint_iter { -#ifdef CONFIG_MODULES - struct tp_module *module; -#endif /* CONFIG_MODULES */ - struct tracepoint * const *tracepoint; -}; - -extern void tracepoint_iter_start(struct tracepoint_iter *iter); -extern void tracepoint_iter_next(struct tracepoint_iter *iter); -extern void tracepoint_iter_stop(struct tracepoint_iter *iter); -extern void tracepoint_iter_reset(struct tracepoint_iter *iter); - /* * tracepoint_synchronize_unregister must be called between the last tracepoint * probe unregistration and the end of module exit to make sure there is no diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index e2a58a22b0f4..65d9f9459a75 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -67,10 +67,7 @@ struct tracepoint_entry { }; struct tp_probes { - union { - struct rcu_head rcu; - struct list_head list; - } u; + struct rcu_head rcu; struct tracepoint_func probes[0]; }; @@ -83,7 +80,7 @@ static inline void *allocate_probes(int count) static void rcu_free_old_probes(struct rcu_head *head) { - kfree(container_of(head, struct tp_probes, u.rcu)); + kfree(container_of(head, struct tp_probes, rcu)); } static inline void release_probes(struct tracepoint_func *old) @@ -91,7 +88,7 @@ static inline void release_probes(struct tracepoint_func *old) if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); + call_rcu_sched(&tp_probes->rcu, rcu_free_old_probes); } } @@ -459,204 +456,11 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); -static LIST_HEAD(old_probes); -static int need_update; - -static void tracepoint_add_old_probes(void *old) -{ - need_update = 1; - if (old) { - struct tp_probes *tp_probes = container_of(old, - struct tp_probes, probes[0]); - list_add(&tp_probes->u.list, &old_probes); - } -} - -/** - * tracepoint_probe_register_noupdate - register a probe but not connect - * @name: tracepoint name - * @probe: probe handler - * @data: probe private data - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_register_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - 
mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); - -/** - * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect - * @name: tracepoint name - * @probe: probe function pointer - * @data: probe private data - * - * caller must call tracepoint_probe_update_all() - */ -int tracepoint_probe_unregister_noupdate(const char *name, void *probe, - void *data) -{ - struct tracepoint_func *old; - - mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe, data); - if (IS_ERR(old)) { - mutex_unlock(&tracepoints_mutex); - return PTR_ERR(old); - } - tracepoint_add_old_probes(old); - mutex_unlock(&tracepoints_mutex); - return 0; -} -EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); - -/** - * tracepoint_probe_update_all - update tracepoints - */ -void tracepoint_probe_update_all(void) -{ - LIST_HEAD(release_probes); - struct tp_probes *pos, *next; - - mutex_lock(&tracepoints_mutex); - if (!need_update) { - mutex_unlock(&tracepoints_mutex); - return; - } - if (!list_empty(&old_probes)) - list_replace_init(&old_probes, &release_probes); - need_update = 0; - tracepoint_update_probes(); - mutex_unlock(&tracepoints_mutex); - list_for_each_entry_safe(pos, next, &release_probes, u.list) { - list_del(&pos->u.list); - call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); - } -} -EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); - -/** - * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. - * @tracepoint: current tracepoints (in), next tracepoint (out) - * @begin: beginning of the range - * @end: end of the range - * - * Returns whether a next tracepoint has been found (1) or not (0). - * Will return the first tracepoint in the range if the input tracepoint is - * NULL. 
- */ -static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, - struct tracepoint * const *begin, struct tracepoint * const *end) -{ - if (!*tracepoint && begin != end) { - *tracepoint = begin; - return 1; - } - if (*tracepoint >= begin && *tracepoint < end) - return 1; - return 0; -} - -#ifdef CONFIG_MODULES -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - struct tp_module *iter_mod; - - /* Core kernel tracepoints */ - if (!iter->module) { - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (found) - goto end; - } - /* Tracepoints in modules */ - mutex_lock(&tracepoints_mutex); - list_for_each_entry(iter_mod, &tracepoint_module_list, list) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - mutex_unlock(&tracepoints_mutex); -end: - if (!found) - tracepoint_iter_reset(iter); -} -#else /* CONFIG_MODULES */ -static void tracepoint_get_iter(struct tracepoint_iter *iter) -{ - int found = 0; - - /* Core kernel tracepoints */ - found = tracepoint_get_iter_range(&iter->tracepoint, - __start___tracepoints_ptrs, - __stop___tracepoints_ptrs); - if (!found) - tracepoint_iter_reset(iter); -} -#endif /* CONFIG_MODULES */ - -void tracepoint_iter_start(struct tracepoint_iter *iter) -{ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_start); - -void tracepoint_iter_next(struct tracepoint_iter *iter) -{ - iter->tracepoint++; - /* - * iter->tracepoint may be invalid because we blindly incremented it. - * Make sure it is valid by marshalling on the tracepoints, getting the - * tracepoints from following modules if necessary. - */ - tracepoint_get_iter(iter); -} -EXPORT_SYMBOL_GPL(tracepoint_iter_next); - -void tracepoint_iter_stop(struct tracepoint_iter *iter) -{ -} -EXPORT_SYMBOL_GPL(tracepoint_iter_stop); - -void tracepoint_iter_reset(struct tracepoint_iter *iter) -{ -#ifdef CONFIG_MODULES - iter->module = NULL; -#endif /* CONFIG_MODULES */ - iter->tracepoint = NULL; -} -EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES static int tracepoint_module_coming(struct module *mod) { - struct tp_module *tp_mod, *iter; + struct tp_module *tp_mod; int ret = 0; if (!mod->num_tracepoints) @@ -677,23 +481,7 @@ static int tracepoint_module_coming(struct module *mod) } tp_mod->num_tracepoints = mod->num_tracepoints; tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; - - /* - * tracepoint_module_list is kept sorted by struct module pointer - * address for iteration on tracepoints from a seq_file that can release - * the mutex between calls. - */ - list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { - BUG_ON(iter == tp_mod); /* Should never be in the list twice */ - if (iter < tp_mod) { - /* We belong to the location right after iter. 
*/ - list_add(&tp_mod->list, &iter->list); - goto module_added; - } - } - /* We belong to the beginning of the list */ - list_add(&tp_mod->list, &tracepoint_module_list); -module_added: + list_add_tail(&tp_mod->list, &tracepoint_module_list); tracepoint_update_probe_range(mod->tracepoints_ptrs, mod->tracepoints_ptrs + mod->num_tracepoints); end: -- cgit v1.3-14-g43fede From d6ee6d2325faeec3fb0122a4840678a2ba62b04b Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Sat, 22 Mar 2014 12:20:31 +0400 Subject: genirq: Export symbol no_action() This will allow the dummy IRQ handler no_action() to be used from drivers compiled as modules. Drivers which use ARM FIQ interrupts can use this to request the interrupt via the normal request_irq() mechanism w/o having to copy the dummy handler to their own code. Signed-off-by: Alexander Shiyan Link: http://lkml.kernel.org/r/1395476431-16070-1-git-send-email-shc_work@mail.ru Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index bfec453557b4..635480270858 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -41,6 +41,7 @@ irqreturn_t no_action(int cpl, void *dev_id) { return IRQ_NONE; } +EXPORT_SYMBOL_GPL(no_action); static void warn_no_thread(unsigned int irq, struct irqaction *action) { -- cgit v1.3-14-g43fede From 01a971406177c2ca9834be6914a67e20f463a3e6 Mon Sep 17 00:00:00 2001 From: Monam Agarwal Date: Mon, 24 Mar 2014 00:17:18 +0530 Subject: cgroup: Use RCU_INIT_POINTER(x, NULL) in cgroup.c This patch replaces rcu_assign_pointer(x, NULL) with RCU_INIT_POINTER(x, NULL) The rcu_assign_pointer() ensures that the initialization of a structure is carried out before storing a pointer to that structure. And in the case of the NULL pointer, there is no structure to initialize. So, rcu_assign_pointer(p, NULL) can be safely converted to RCU_INIT_POINTER(p, NULL) Signed-off-by: Monam Agarwal Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 58c67b3060b5..e378cb2fac5e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3545,7 +3545,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL); + RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } -- cgit v1.3-14-g43fede From 3862807880acc0adaef6749738d210c9f45c3049 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 24 Mar 2014 14:03:57 +0000 Subject: tracing: Add BUG_ON when stack end location is overwritten It is difficult to detect a stack overrun when it actually occurs. We have observed that this type of corruption is often silent and can go unnoticed. Once the corrupted region is examined, the outcome is undefined and often results in sporadic system crashes. When the stack tracing feature is enabled, let's check for this condition and take appropriate action. Note: init_task doesn't get its stack end location set to STACK_END_MAGIC.
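The check added below boils down to comparing the canary word at the stack's end with STACK_END_MAGIC, skipping init_task for the reason just noted. As a hedged sketch (the helper name is made up for illustration, not part of the patch):

  #include <linux/sched.h>      /* end_of_stack() */
  #include <linux/init_task.h>  /* init_task */
  #include <linux/magic.h>      /* STACK_END_MAGIC */

  /* Hypothetical helper: true if @tsk's stack-end canary was clobbered. */
  static bool stack_end_corrupted(struct task_struct *tsk)
  {
          return tsk != &init_task &&
                 *end_of_stack(tsk) != STACK_END_MAGIC;
  }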
Link: http://lkml.kernel.org/r/1395669837-30209-1-git-send-email-atomlin@redhat.com Signed-off-by: Aaron Tomlin Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index e6be585cf06a..21b320e5d163 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -144,6 +145,8 @@ check_stack(unsigned long ip, unsigned long *stack) i++; } + BUG_ON(current != &init_task && + *(end_of_stack(current)) != STACK_END_MAGIC); out: arch_spin_unlock(&max_stack_lock); local_irq_restore(flags); -- cgit v1.3-14-g43fede From e231d54c1239ccf31aaee311bed0c4d1937cae2c Mon Sep 17 00:00:00 2001 From: Monam Agarwal Date: Mon, 24 Mar 2014 00:16:19 +0530 Subject: kernel: Use RCU_INIT_POINTER(x, NULL) in audit.c This patch replaces rcu_assign_pointer(x, NULL) with RCU_INIT_POINTER(x, NULL) The rcu_assign_pointer() ensures that the initialization of a structure is carried out before storing a pointer to that structure. And in the case of the NULL pointer, there is no structure to initialize. So, rcu_assign_pointer(p, NULL) can be safely converted to RCU_INIT_POINTER(p, NULL) Signed-off-by: Monam Agarwal Signed-off-by: Eric Paris --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index c0696dcfed11..ad77d1e80895 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1092,7 +1092,7 @@ static void __net_exit audit_net_exit(struct net *net) audit_sock = NULL; } - rcu_assign_pointer(aunet->nlsk, NULL); + RCU_INIT_POINTER(aunet->nlsk, NULL); synchronize_net(); netlink_kernel_release(sock); } -- cgit v1.3-14-g43fede From ea2e64f280d2a34a8ed9ae3d783cd770d14b70ec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 23 Mar 2014 14:20:44 +0000 Subject: workqueue: Provide destroy_delayed_work_on_stack() If a delayed or deferrable work is on stack we need to tell debug objects that we are destroying the timer and the work. Otherwise we leak the tracking object. 
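Usage then mirrors the existing destroy_work_on_stack() pattern. A hedged sketch of the intended pairing (the work function and timeout are illustrative):

  #include <linux/workqueue.h>
  #include <linux/jiffies.h>

  static void my_work_fn(struct work_struct *work)
  {
          /* ... */
  }

  static void run_delayed_work_on_stack(void)
  {
          struct delayed_work dwork;

          INIT_DELAYED_WORK_ONSTACK(&dwork, my_work_fn);
          schedule_delayed_work(&dwork, msecs_to_jiffies(10));
          flush_delayed_work(&dwork);
          /* New: frees the debugobjects state of both the timer and the work. */
          destroy_delayed_work_on_stack(&dwork);
  }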
Signed-off-by: Thomas Gleixner Cc: Vince Weaver Acked-by: Tejun Heo Link: http://lkml.kernel.org/r/20140323141939.911487677@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 594521ba0d43..abdbe5af119d 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -191,6 +191,7 @@ struct execute_work { #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); +extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; @@ -198,6 +199,7 @@ static inline unsigned int work_static(struct work_struct *work) #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } +static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 82ef9f3b7473..5b690b5a9e74 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -516,6 +516,13 @@ void destroy_work_on_stack(struct work_struct *work) } EXPORT_SYMBOL_GPL(destroy_work_on_stack); +void destroy_delayed_work_on_stack(struct delayed_work *work) +{ + destroy_timer_on_stack(&work->timer); + debug_object_free(&work->work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack); + #else static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } -- cgit v1.3-14-g43fede From cacb3c76c2012ade52124e8c6fdc5cb125625772 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 25 Mar 2014 16:09:18 +0530 Subject: tick: Fix spelling mistake in tick_handle_periodic() One of the comments in tick_handle_periodic() had 'when' instead of 'which' (My guess :)). Fix it. Also fix spelling mistake in 'Possible'. Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: skarafotis@gmail.com Link: http://lkml.kernel.org/r/2b29ca4230c163e44179941d7c7a16c1474385c2.1395743878.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 20b2fe37d105..0fec63414fb6 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -118,7 +118,7 @@ void tick_handle_periodic(struct clock_event_device *dev) * to be sure we're using a real hardware clocksource. * Otherwise we could get trapped in an infinite * loop, as the tick_periodic() increments jiffies, - * when then will increment time, posibly causing + * which then will increment time, possibly causing * the loop to trigger again and again. */ if (timekeeping_valid_for_hres()) -- cgit v1.3-14-g43fede From b97f0291a2504291aef850077f98cab68a5a2f33 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 25 Mar 2014 13:56:23 +0530 Subject: tick: Remove code duplication in tick_handle_periodic() tick_handle_periodic() is calling ktime_add() at two places, first before the infinite loop and then at the end of infinite loop. We can rearrange code a bit to fix code duplication here. 
It looks quite simple and shouldn't break anything, I guess :) Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/be3481e8f3f71df694a4b43623254fc93ca51b59.1395735873.git.viresh.kumar@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-common.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 0fec63414fb6..015661279b68 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -98,18 +98,19 @@ static void tick_periodic(int cpu) void tick_handle_periodic(struct clock_event_device *dev) { int cpu = smp_processor_id(); - ktime_t next; + ktime_t next = dev->next_event; tick_periodic(cpu); if (dev->mode != CLOCK_EVT_MODE_ONESHOT) return; - /* - * Setup the next period for devices, which do not have - * periodic mode: - */ - next = ktime_add(dev->next_event, tick_period); for (;;) { + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(next, tick_period); + if (!clockevents_program_event(dev, next, false)) return; /* @@ -123,7 +124,6 @@ void tick_handle_periodic(struct clock_event_device *dev) */ if (timekeeping_valid_for_hres()) tick_periodic(cpu); - next = ktime_add(next, tick_period); } } -- cgit v1.3-14-g43fede From 2c4a33aba5f9ea3a28f2e40351f078d95f00786b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 25 Mar 2014 23:39:41 -0400 Subject: tracing: Fix traceon trigger condition to actually turn tracing on While working on my tutorial for the 2014 Linux Collaboration Summit I found that the traceon trigger did not work when conditions were used. The other triggers worked fine though. Looking into it, it is because of the way the triggers use the ring buffer to store the fields it will use for the condition. But if tracing is off, nothing is stored in the buffer, and the tracepoint exits before calling the trigger to test the condition. This is fine for all the triggers that only work when tracing is on, but for the traceon trigger, which is supposed to work when tracing is off, nothing happens. The fix is simple: just use a temp ring buffer to record the event if tracing is off and the event has a trace event conditional trigger enabled. The rest of the tracepoint code will work just fine, but the tracepoint won't be recorded in the other buffers. Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 815c878f409b..24c1f2382557 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1600,15 +1600,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); +static struct ring_buffer *temp_buffer; + struct ring_buffer_event * trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, struct ftrace_event_file *ftrace_file, int type, unsigned long len, unsigned long flags, int pc) { + struct ring_buffer_event *entry; + *current_rb = ftrace_file->tr->trace_buffer.buffer; - return trace_buffer_lock_reserve(*current_rb, + entry = trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); + /* + * If tracing is off, but we have triggers enabled + * we still need to look at the event data. Use the temp_buffer + * to store the trace event for the trigger to use.
It's recursion + * safe and will not be recorded anywhere. + */ + if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) { + *current_rb = temp_buffer; + entry = trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); + } + return entry; } EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); @@ -6494,11 +6510,16 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); + /* Used for event triggers */ + temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); + if (!temp_buffer) + goto out_free_cpumask; + /* TODO: make the number of buffers hot pluggable with CPUS */ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); - goto out_free_cpumask; + goto out_free_temp_buffer; } if (global_trace.buffer_disabled) @@ -6540,6 +6561,8 @@ __init static int tracer_alloc_buffers(void) return 0; +out_free_temp_buffer: + ring_buffer_free(temp_buffer); out_free_cpumask: free_percpu(global_trace.trace_buffer.data); #ifdef CONFIG_TRACER_MAX_TRACE -- cgit v1.3-14-g43fede From cab5e127eef040399902caa8e1510795583fa03a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 27 Mar 2014 16:30:49 -0700 Subject: time: Revert to calling clock_was_set_delayed() while in irq context In commit 47a1b796306356f35 ("tick/timekeeping: Call update_wall_time outside the jiffies lock"), we moved to calling clock_was_set() due to the fact that we were no longer holding the timekeeping or jiffies lock. However, there is still the problem that clock_was_set() triggers an IPI, which cannot be done from the timer's hard irq context, and will generate WARN_ON warnings. Apparently, in my earlier testing, I didn't bump the dmesg log level, so I somehow missed the WARN_ONs. Thus we need to revert to calling clock_was_set_delayed(). Signed-off-by: John Stultz Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1395963049-11923-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0aa4ce81bc16..5b40279ecd71 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1435,7 +1435,8 @@ void update_wall_time(void) out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); if (clock_set) - clock_was_set(); + /* Have to call _delayed version, since in irq context */ + clock_was_set_delayed(); } /** -- cgit v1.3-14-g43fede From e8604cb43690b781f9a7ad4a770f3e10259fe939 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 28 Mar 2014 15:18:27 +0800 Subject: cgroup: fix spurious lockdep warning in cgroup_exit() cgroup_exit() is called in the fork and exit paths. If it's called in the failure path during fork, PF_EXITING isn't set, and then lockdep will complain. Fix this by removing cgroup_exit() in that failure path. cgroup_fork() does nothing that needs cleanup. Reported-by: Sasha Levin Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +-- kernel/fork.c | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e378cb2fac5e..60fd6f1f6d4e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4431,8 +4431,7 @@ void cgroup_post_fork(struct task_struct *child) * notify_on_release(), then leave the task attached to the root cgroup in * each hierarchy for the remainder of its exit.
No need to bother with * init_css_set refcnting. init_css_set never goes away and we can't race - * with migration path - either PF_EXITING is visible to migration path or - * @tsk never got on the tasklist. + * with migration path - PF_EXITING is visible to migration path. */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { diff --git a/kernel/fork.c b/kernel/fork.c index a17621c6cd42..8852b3463ab7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1271,7 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_cgroup; + goto bad_fork_cleanup_threadgroup_lock; } mpol_fix_fork_child_flag(p); #endif @@ -1524,11 +1524,10 @@ bad_fork_cleanup_policy: perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); -bad_fork_cleanup_cgroup: +bad_fork_cleanup_threadgroup_lock: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: -- cgit v1.3-14-g43fede From 1ec41830e087cda1f62dda4182c2b62811eb0ffc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 28 Mar 2014 15:22:19 +0800 Subject: cgroup: remove useless argument from cgroup_exit() Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 5 ++--- kernel/exit.c | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 43d1ed30bae3..c2515851c1aa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -33,7 +33,7 @@ extern int cgroup_init_early(void); extern int cgroup_init(void); extern void cgroup_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); -extern void cgroup_exit(struct task_struct *p, int run_callbacks); +extern void cgroup_exit(struct task_struct *p); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); @@ -843,7 +843,7 @@ static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_fork(struct task_struct *p) {} static inline void cgroup_post_fork(struct task_struct *p) {} -static inline void cgroup_exit(struct task_struct *p, int callbacks) {} +static inline void cgroup_exit(struct task_struct *p) {} static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 60fd6f1f6d4e..f7f94322d312 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4416,7 +4416,6 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process - * @run_callback: run exit callbacks? * * Description: Detach cgroup from @tsk and release it. * @@ -4433,7 +4432,7 @@ void cgroup_post_fork(struct task_struct *child) * init_css_set refcnting. init_css_set never goes away and we can't race * with migration path - PF_EXITING is visible to migration path. 
*/ -void cgroup_exit(struct task_struct *tsk, int run_callbacks) +void cgroup_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; @@ -4455,7 +4454,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) cset = task_css_set(tsk); RCU_INIT_POINTER(tsk->cgroups, &init_css_set); - if (run_callbacks && need_forkexit_callback) { + if (need_forkexit_callback) { /* see cgroup_post_fork() for details */ for_each_subsys(ss, i) { if (ss->exit) { diff --git a/kernel/exit.c b/kernel/exit.c index 1e77fc645317..6480d1c85d7a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -797,7 +797,7 @@ void do_exit(long code) */ perf_event_exit_task(tsk); - cgroup_exit(tsk, 1); + cgroup_exit(tsk); if (group_dead) disassociate_ctty(1); -- cgit v1.3-14-g43fede From aa4af831bb4f3168f2f574b2620124699c09c4a3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Sun, 30 Mar 2014 19:07:54 -0400 Subject: AUDIT: Allow login in non-init namespaces It is possible to configure your PAM stack to refuse login if audit messages (about the login) could not be sent. This is common in many distros and thus normal configuration of many containers. The PAM modules determine if audit is enabled/disabled in the kernel based on the return value from sending an audit message on the netlink socket. If userspace gets back ECONNREFUSED it believes audit is disabled in the kernel. If it gets any other error, it refuses to let the login proceed. Just about ever since the introduction of namespaces the kernel audit subsystem has returned EPERM if the task sending a message was not in the init user or pid namespace. So many forms of containers have never worked if audit was enabled in the kernel. BUT if the container was not in init_net then the kernel network code would send ECONNREFUSED (instead of the audit code sending EPERM). Thus by pure accident/dumb luck/bug, if an admin configured the PAM stack to reject all logins that didn't talk to audit, but then ran the login utility in the non-init_net namespace, it would work!! Clearly this was a bug, but it is a bug some people expected. With the introduction of network namespace support in 3.14-rc1 the two bugs stopped cancelling each other out. Now, containers in the non-init_net namespace refused to let users log in (just like PAM was configured!) Obviously some people were not happy that what used to let users log in, now didn't! This fix is kinda hacky. We return ECONNREFUSED for all non-init relevant namespaces. That means that not only will the old broken non-init_net setups continue to work, now the broken non-init_pid or non-init_user setups will 'work'. They don't really work, since audit isn't logging things. But it's what most users want. In 3.15 we should have patches to support not only the non-init_net (3.14) namespace but also the non-init_pid and non-init_user namespaces. So all will be right in the world. This just opens the doors wide open on 3.14 and hopefully makes users happy, if not the audit system... Reported-by: Andre Tomt Reported-by: Adam Richter Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- kernel/audit.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 3392d3e0254a..95a20f3f52f1 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -608,9 +608,19 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) int err = 0; /* Only support the initial namespaces for now.
*/ + /* + * We return ECONNREFUSED because it tricks userspace into thinking + * that audit was not configured into the kernel. Lots of users + * configure their PAM stack (because that's what the distro does) + * to reject login if unable to send messages to audit. If we return + * ECONNREFUSED the PAM stack thinks the kernel does not have audit + * configured in and will let login proceed. If we return EPERM + * userspace will reject all logins. This should be removed when we + * support non init namespaces!! + */ if ((current_user_ns() != &init_user_ns) || (task_active_pid_ns(current) != &init_pid_ns)) - return -EPERM; + return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: -- cgit v1.3-14-g43fede From 57673c2b0baa900dddae3b9eb3d7748ebf550eb3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 31 Mar 2014 14:39:57 +1030 Subject: Use 'E' instead of 'X' for unsigned module taint flag. Takashi Iwai says: > The letter 'X' has been already used for SUSE kernels for very long > time, to indicate the external supported modules. Can the new flag be > changed to another letter for avoiding conflict...? > (BTW, we also use 'N' for "no support", too.) Note: this code should be cleaned up, so we don't have such maps in three places! Signed-off-by: Rusty Russell --- Documentation/ABI/testing/sysfs-module | 2 +- Documentation/module-signing.txt | 2 +- Documentation/oops-tracing.txt | 2 +- kernel/module.c | 2 +- kernel/panic.c | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-module b/Documentation/ABI/testing/sysfs-module index b9a29cdbaccb..0aac02e7fb0e 100644 --- a/Documentation/ABI/testing/sysfs-module +++ b/Documentation/ABI/testing/sysfs-module @@ -49,4 +49,4 @@ Description: Module taint flags: O - out-of-tree module F - force-loaded module C - staging driver module - X - unsigned module + E - unsigned module diff --git a/Documentation/module-signing.txt b/Documentation/module-signing.txt index b6af42e4d790..2429024c0749 100644 --- a/Documentation/module-signing.txt +++ b/Documentation/module-signing.txt @@ -54,7 +54,7 @@ This has a number of options available: If this is off (ie. "permissive"), then modules for which the key is not available and modules that are unsigned are permitted, but the kernel will be marked as being tainted, and the concerned modules will be marked as - tainted, shown with the character 'X'. + tainted, shown with the character 'E'. If this is on (ie. "restrictive"), only modules that have a valid signature that can be verified by a public key in the kernel's possession diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index 879abe289523..e3155995ddd8 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -265,7 +265,7 @@ characters, each representing a particular tainted value. 13: 'O' if an externally-built ("out-of-tree") module has been loaded. - 14: 'X' if an unsigned module has been loaded in a kernel supporting + 14: 'E' if an unsigned module has been loaded in a kernel supporting module signature. 
The primary reason for the 'Tainted: ' string is to tell kernel diff --git a/kernel/module.c b/kernel/module.c index c1acb0c5b637..5806e096d110 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1014,7 +1014,7 @@ static size_t module_flags_taint(struct module *mod, char *buf) if (mod->taints & (1 << TAINT_CRAP)) buf[l++] = 'C'; if (mod->taints & (1 << TAINT_UNSIGNED_MODULE)) - buf[l++] = 'X'; + buf[l++] = 'E'; /* * TAINT_FORCED_RMMOD: could be added. * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't diff --git a/kernel/panic.c b/kernel/panic.c index 0e25fe10871e..02b6c9f0171b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -210,7 +210,7 @@ static const struct tnt tnts[] = { { TAINT_CRAP, 'C', ' ' }, { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, - { TAINT_UNSIGNED_MODULE, 'X', ' ' }, + { TAINT_UNSIGNED_MODULE, 'E', ' ' }, }; /** @@ -229,7 +229,7 @@ static const struct tnt tnts[] = { * 'C' - modules from drivers/staging are loaded. * 'I' - Working around severe firmware bug. * 'O' - Out-of-tree module has been loaded. - * 'X' - Unsigned module has been loaded. + * 'E' - Unsigned module has been loaded. * * The string is overwritten by the next call to print_tainted(). */ -- cgit v1.3-14-g43fede From bd4cf0ed331a275e9bf5a49e6d0fd55dffc551b8 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 28 Mar 2014 18:58:25 +0100 Subject: net: filter: rework/optimize internal BPF interpreter's instruction set This patch replaces/reworks the kernel-internal BPF interpreter with an optimized BPF instruction set format that is modelled more closely on native instruction sets and is designed to be JITed with a one-to-one mapping. Thus, the new interpreter is noticeably faster than the current implementation of sk_run_filter(), mainly for two reasons: 1. Fall-through jumps: BPF jump instructions are forced to take either the 'true' or the 'false' branch, which causes a branch-miss penalty. The new BPF jump instructions have only one branch and fall through otherwise, which fits the CPU branch predictor logic better. `perf stat` shows a drastic difference in branch-misses between the old and new code. 2. Jump-threaded implementation of interpreter vs switch statement: Instead of a single table-jump at the top of the 'switch' statement, gcc will now generate multiple table-jump instructions, which helps CPU branch predictor logic. Note that the verification of filters is still being done through sk_chk_filter() in classical BPF format, so filters from user- or kernel space are verified in the same way as we do now, and the same restrictions/constraints hold as well. We reuse current BPF JIT compilers in a way that this upgrade would even be fine as is, but nevertheless allows for a successive upgrade of BPF JIT compilers to the new format. The internal instruction set migration is being done after the probing for JIT compilation, so in case JIT compilers are able to create a native opcode image, we're going to use that, and in all other cases we're doing a follow-up migration of the BPF program's instruction set, so that it can be transparently run in the new interpreter.
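To make the fall-through point in (1) concrete, here is a hedged sketch of the same conditional in both encodings; the fields follow the struct sock_filter_int layout added by this patch, while the concrete operand values are illustrative:

  #include <linux/filter.h>

  /* Classic BPF: a conditional carries both jump offsets, jt and jf. */
  static struct sock_filter old_jeq = {
          .code = BPF_JMP | BPF_JEQ | BPF_K,
          .jt   = 1,      /* A == 22: skip one instruction */
          .jf   = 2,      /* A != 22: skip two instructions */
          .k    = 22,
  };

  /* Internal BPF: one jump target, otherwise fall through to the next
   * instruction, which the CPU's branch predictor copes with better.
   */
  static struct sock_filter_int new_jne = {
          .code  = BPF_JMP | BPF_JNE | BPF_K,
          .a_reg = 0,     /* register to compare */
          .off   = 2,     /* taken: skip two instructions */
          .imm   = 22,    /* not taken: fall through */
  };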
In short, the *internal* format extends BPF in the following way (more details can be taken from the appended documentation):

 - Number of registers increases from 2 to 10
 - Register width increases from 32-bit to 64-bit
 - Conditional jt/jf targets replaced with jt/fall-through
 - Adds signed > and >= insns
 - 16 4-byte stack slots for register spill-fill replaced with up to 512 bytes of multi-use stack space
 - Introduction of bpf_call insn and register passing convention for zero overhead calls from/to other kernel functions
 - Adds arithmetic right shift and endianness conversion insns
 - Adds atomic_add insn
 - Old tax/txa insns are replaced with 'mov dst,src' insn

Performance of two BPF filters generated by libpcap resp. bpf_asm was measured on x86_64, i386 and arm32 (other libpcap programs have similar performance differences):

fprog #1 is taken from Documentation/networking/filter.txt:
tcpdump -i eth0 port 22 -dd

fprog #2 is taken from 'man tcpdump':
tcpdump -i eth0 'tcp port 22 and (((ip[2:2] - ((ip[0]&0xf)<<2)) - ((tcp[12]&0xf0)>>2)) != 0)' -dd

Raw performance data from BPF micro-benchmark: SK_RUN_FILTER on the same SKB (cache-hit) or 10k SKBs (cache-miss); time in ns per call, smaller is better:

--x86_64--
             fprog #1   fprog #1   fprog #2   fprog #2
             cache-hit  cache-miss cache-hit  cache-miss
old BPF          90        101        192        202
new BPF          31         71         47         97
old BPF jit      12         34         17         44
new BPF jit          TBD

--i386--
             fprog #1   fprog #1   fprog #2   fprog #2
             cache-hit  cache-miss cache-hit  cache-miss
old BPF         107        136        227        252
new BPF          40        119         69        172

--arm32--
             fprog #1   fprog #1   fprog #2   fprog #2
             cache-hit  cache-miss cache-hit  cache-miss
old BPF         202        300        475        540
new BPF         180        270        330        470
old BPF jit      26        182         37        202
new BPF jit          TBD

Thus, without changing any userland BPF filters, applications on top of AF_PACKET (or other families) such as libpcap/tcpdump, cls_bpf classifier, netfilter's xt_bpf, team driver's load-balancing mode, and many more will have better interpreter filtering performance.

While we are replacing the internal BPF interpreter, we also need to convert seccomp BPF in the same step to make use of the new internal structure since it makes use of lower-level API details without being further decoupled through higher-level calls like sk_unattached_filter_{create,destroy}(), for example.

Just as for normal socket filtering, also seccomp BPF experiences a time-to-verdict speedup. 05-sim-long_jumps.c of libseccomp was used as micro-benchmark:

  seccomp_rule_add_exact(ctx,...
  seccomp_rule_add_exact(ctx,...
  rc = seccomp_load(ctx);
  for (i = 0; i < 10000000; i++)
     syscall(199, 100);

'short filter' has 2 rules
'large filter' has 200 rules

'short filter' performance is slightly better on x86_64/i386/arm32
'large filter' is much faster on x86_64 and i386 and shows no difference on arm32

--x86_64-- short filter
old BPF: 2.7 sec
 39.12% bench libc-2.15.so       [.] syscall
  8.10% bench [kernel.kallsyms]  [k] sk_run_filter
  6.31% bench [kernel.kallsyms]  [k] system_call
  5.59% bench [kernel.kallsyms]  [k] trace_hardirqs_on_caller
  4.37% bench [kernel.kallsyms]  [k] trace_hardirqs_off_caller
  3.70% bench [kernel.kallsyms]  [k] __secure_computing
  3.67% bench [kernel.kallsyms]  [k] lock_is_held
  3.03% bench [kernel.kallsyms]  [k] seccomp_bpf_load
new BPF: 2.58 sec
 42.05% bench libc-2.15.so       [.] syscall
  6.91% bench [kernel.kallsyms]  [k] system_call
  6.25% bench [kernel.kallsyms]  [k] trace_hardirqs_on_caller
  6.07% bench [kernel.kallsyms]  [k] __secure_computing
  5.08% bench [kernel.kallsyms]  [k] sk_run_filter_int_seccomp

--arm32-- short filter
old BPF: 4.0 sec
 39.92% bench [kernel.kallsyms]  [k] vector_swi
 16.60% bench [kernel.kallsyms]  [k] sk_run_filter
 14.66% bench libc-2.17.so       [.] syscall
  5.42% bench [kernel.kallsyms]  [k] seccomp_bpf_load
  5.10% bench [kernel.kallsyms]  [k] __secure_computing
new BPF: 3.7 sec
 35.93% bench [kernel.kallsyms]  [k] vector_swi
 21.89% bench libc-2.17.so       [.] syscall
 13.45% bench [kernel.kallsyms]  [k] sk_run_filter_int_seccomp
  6.25% bench [kernel.kallsyms]  [k] __secure_computing
  3.96% bench [kernel.kallsyms]  [k] syscall_trace_exit

--x86_64-- large filter
old BPF: 8.6 seconds
 73.38% bench [kernel.kallsyms]  [k] sk_run_filter
 10.70% bench libc-2.15.so       [.] syscall
  5.09% bench [kernel.kallsyms]  [k] seccomp_bpf_load
  1.97% bench [kernel.kallsyms]  [k] system_call
new BPF: 5.7 seconds
 66.20% bench [kernel.kallsyms]  [k] sk_run_filter_int_seccomp
 16.75% bench libc-2.15.so       [.] syscall
  3.31% bench [kernel.kallsyms]  [k] system_call
  2.88% bench [kernel.kallsyms]  [k] __secure_computing

--i386-- large filter
old BPF: 5.4 sec
new BPF: 3.8 sec

--arm32-- large filter
old BPF: 13.5 sec
 73.88% bench [kernel.kallsyms]  [k] sk_run_filter
 10.29% bench [kernel.kallsyms]  [k] vector_swi
  6.46% bench libc-2.17.so       [.] syscall
  2.94% bench [kernel.kallsyms]  [k] seccomp_bpf_load
  1.19% bench [kernel.kallsyms]  [k] __secure_computing
  0.87% bench [kernel.kallsyms]  [k] sys_getuid
new BPF: 13.5 sec
 76.08% bench [kernel.kallsyms]  [k] sk_run_filter_int_seccomp
 10.98% bench [kernel.kallsyms]  [k] vector_swi
  5.87% bench libc-2.17.so       [.] syscall
  1.77% bench [kernel.kallsyms]  [k] __secure_computing
  0.93% bench [kernel.kallsyms]  [k] sys_getuid

BPF filters generated by seccomp are very branchy, so the new internal BPF performance is better than the old one. Performance gains will be even higher when BPF JIT is committed for the new structure, which is planned in future work (as successive JIT migrations). BPF has also been stress-tested with trinity's BPF fuzzer. Joint work with Daniel Borkmann. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Cc: Hagen Paul Pfeifer Cc: Kees Cook Cc: Paul Moore Cc: Ingo Molnar Cc: H. Peter Anvin Cc: linux-kernel@vger.kernel.org Acked-by: Kees Cook Signed-off-by: David S. Miller --- include/linux/filter.h | 74 ++- include/linux/seccomp.h | 1 - kernel/seccomp.c | 119 ++-- net/core/filter.c | 1457 +++++++++++++++++++++++++++++++++++++---------- 4 files changed, 1279 insertions(+), 372 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 9bde3ed19fe6..262dcbb75ffe 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -9,13 +9,58 @@ #include #include -#ifdef CONFIG_COMPAT -/* - * A struct sock_filter is architecture independent. +/* Internally used and optimized filter representation with extended + * instruction set based on top of classic BPF.
*/ + +/* instruction classes */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word */ +#define BPF_XADD 0xc0 /* exclusive add */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG 11 + +/* BPF program can access up to 512 bytes of stack space. */ +#define MAX_BPF_STACK 512 + +/* Arg1, context and stack frame pointer register positions. */ +#define ARG1_REG 1 +#define CTX_REG 6 +#define FP_REG 10 + +struct sock_filter_int { + __u8 code; /* opcode */ + __u8 a_reg:4; /* dest register */ + __u8 x_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +#ifdef CONFIG_COMPAT +/* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { u16 len; - compat_uptr_t filter; /* struct sock_filter * */ + compat_uptr_t filter; /* struct sock_filter * */ }; #endif @@ -26,6 +71,7 @@ struct sock_fprog_kern { struct sk_buff; struct sock; +struct seccomp_data; struct sk_filter { atomic_t refcnt; @@ -34,9 +80,10 @@ struct sk_filter { struct sock_fprog_kern *orig_prog; /* Original BPF program */ struct rcu_head rcu; unsigned int (*bpf_func)(const struct sk_buff *skb, - const struct sock_filter *filter); + const struct sock_filter_int *filter); union { - struct sock_filter insns[0]; + struct sock_filter insns[0]; + struct sock_filter_int insnsi[0]; struct work_struct work; }; }; @@ -50,9 +97,18 @@ static inline unsigned int sk_filter_size(unsigned int proglen) #define sk_filter_proglen(fprog) \ (fprog->len * sizeof(fprog->filter[0])) +#define SK_RUN_FILTER(filter, ctx) \ + (*filter->bpf_func)(ctx, filter->insnsi) + int sk_filter(struct sock *sk, struct sk_buff *skb); -unsigned int sk_run_filter(const struct sk_buff *skb, - const struct sock_filter *filter); + +u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, + const struct sock_filter_int *insni); +u32 sk_run_filter_int_skb(const struct sk_buff *ctx, + const struct sock_filter_int *insni); + +int sk_convert_filter(struct sock_filter *prog, int len, + struct sock_filter_int *new_prog, int *new_len); int sk_unattached_filter_create(struct sk_filter **pfp, struct sock_fprog *fprog); @@ -86,7 +142,6 @@ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); } -#define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns) #else #include static inline void bpf_jit_compile(struct sk_filter *fp) @@ -96,7 +151,6 @@ static inline void bpf_jit_free(struct sk_filter *fp) { kfree(fp); } -#define SK_RUN_FILTER(FILTER, SKB) sk_run_filter(SKB, FILTER->insns) #endif static inline int bpf_tell_extensions(void) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 6f19cfd1840e..4054b0994071 100644 --- 
a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s) #ifdef CONFIG_SECCOMP_FILTER extern void put_seccomp_filter(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); -extern u32 seccomp_bpf_load(int off); #else /* CONFIG_SECCOMP_FILTER */ static inline void put_seccomp_filter(struct task_struct *tsk) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..4f18e754c23e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -55,60 +55,33 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; unsigned short len; /* Instruction count */ - struct sock_filter insns[]; + struct sock_filter_int insnsi[]; }; /* Limit any path through the tree to 256KB worth of instructions. */ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) -/** - * get_u32 - returns a u32 offset into data - * @data: a unsigned 64 bit value - * @index: 0 or 1 to return the first or second 32-bits - * - * This inline exists to hide the length of unsigned long. If a 32-bit - * unsigned long is passed in, it will be extended and the top 32-bits will be - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be - * properly returned. - * +/* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture. */ -static inline u32 get_u32(u64 data, int index) +static void populate_seccomp_data(struct seccomp_data *sd) { - return ((u32 *)&data)[index]; -} + struct task_struct *task = current; + struct pt_regs *regs = task_pt_regs(task); -/* Helper for bpf_load below. */ -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) -/** - * bpf_load: checks and returns a pointer to the requested offset - * @off: offset into struct seccomp_data to load from - * - * Returns the requested 32-bits of data. - * seccomp_check_filter() should assure that @off is 32-bit aligned - * and not out of bounds. Failure to do so is a BUG. - */ -u32 seccomp_bpf_load(int off) -{ - struct pt_regs *regs = task_pt_regs(current); - if (off == BPF_DATA(nr)) - return syscall_get_nr(current, regs); - if (off == BPF_DATA(arch)) - return syscall_get_arch(current, regs); - if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { - unsigned long value; - int arg = (off - BPF_DATA(args[0])) / sizeof(u64); - int index = !!(off % sizeof(u64)); - syscall_get_arguments(current, regs, arg, 1, &value); - return get_u32(value, index); - } - if (off == BPF_DATA(instruction_pointer)) - return get_u32(KSTK_EIP(current), 0); - if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) - return get_u32(KSTK_EIP(current), 1); - /* seccomp_check_filter should make this impossible. */ - BUG(); + sd->nr = syscall_get_nr(task, regs); + sd->arch = syscall_get_arch(task, regs); + + /* Unroll syscall_get_args to help gcc on arm. 
*/ + syscall_get_arguments(task, regs, 0, 1, (unsigned long *) &sd->args[0]); + syscall_get_arguments(task, regs, 1, 1, (unsigned long *) &sd->args[1]); + syscall_get_arguments(task, regs, 2, 1, (unsigned long *) &sd->args[2]); + syscall_get_arguments(task, regs, 3, 1, (unsigned long *) &sd->args[3]); + syscall_get_arguments(task, regs, 4, 1, (unsigned long *) &sd->args[4]); + syscall_get_arguments(task, regs, 5, 1, (unsigned long *) &sd->args[5]); + + sd->instruction_pointer = KSTK_EIP(task); } /** @@ -133,17 +106,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) switch (code) { case BPF_S_LD_W_ABS: - ftest->code = BPF_S_ANC_SECCOMP_LD_W; + ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; case BPF_S_LD_W_LEN: - ftest->code = BPF_S_LD_IMM; + ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; case BPF_S_LDX_W_LEN: - ftest->code = BPF_S_LDX_IMM; + ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ @@ -185,6 +158,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) case BPF_S_JMP_JGT_X: case BPF_S_JMP_JSET_K: case BPF_S_JMP_JSET_X: + sk_decode_filter(ftest, ftest); continue; default: return -EINVAL; @@ -202,18 +176,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) static u32 seccomp_run_filters(int syscall) { struct seccomp_filter *f; + struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(current->seccomp.filter == NULL)) return SECCOMP_RET_KILL; + populate_seccomp_data(&sd); + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter(NULL, f->insns); + u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -231,6 +208,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) struct seccomp_filter *filter; unsigned long fp_size = fprog->len * sizeof(struct sock_filter); unsigned long total_insns = fprog->len; + struct sock_filter *fp; + int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) @@ -252,28 +231,43 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return -EACCES; - /* Allocate a new seccomp_filter */ - filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, - GFP_KERNEL|__GFP_NOWARN); - if (!filter) + fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); + if (!fp) return -ENOMEM; - atomic_set(&filter->usage, 1); - filter->len = fprog->len; /* Copy the instructions from fprog. 
*/ ret = -EFAULT; - if (copy_from_user(filter->insns, fprog->filter, fp_size)) - goto fail; + if (copy_from_user(fp, fprog->filter, fp_size)) + goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(filter->insns, filter->len); + ret = sk_chk_filter(fp, fprog->len); if (ret) - goto fail; + goto free_prog; /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(filter->insns, filter->len); + ret = seccomp_check_filter(fp, fprog->len); + if (ret) + goto free_prog; + + /* Convert 'sock_filter' insns to 'sock_filter_int' insns */ + ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + if (ret) + goto free_prog; + + /* Allocate a new seccomp_filter */ + filter = kzalloc(sizeof(struct seccomp_filter) + + sizeof(struct sock_filter_int) * new_len, + GFP_KERNEL|__GFP_NOWARN); + if (!filter) + goto free_prog; + + ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len); if (ret) - goto fail; + goto free_filter; + + atomic_set(&filter->usage, 1); + filter->len = new_len; /* * If there is an existing filter, make it the prev and don't drop its @@ -282,8 +276,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) filter->prev = current->seccomp.filter; current->seccomp.filter = filter; return 0; -fail: + +free_filter: kfree(filter); +free_prog: + kfree(fp); return ret; } diff --git a/net/core/filter.c b/net/core/filter.c index 5b3427aaeca5..3733381190ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1,11 +1,16 @@ /* * Linux Socket Filter - Kernel level socket filtering * - * Author: - * Jay Schulist + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: * - * Based on the design of: - * - The Berkeley Packet Filter + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist + * Alexei Starovoitov + * Daniel Borkmann * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -108,304 +113,1045 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(sk_filter); +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + /** - * sk_run_filter - run a filter on a socket - * @skb: buffer to run the filter on + * __sk_run_filter - run a filter on a given context + * @ctx: buffer to run the filter on * @fentry: filter to apply * - * Decode and apply filter instructions to the skb->data. - * Return length to keep, 0 for none. @skb is the data we are - * filtering, @filter is the array of filter instructions. - * Because all jumps are guaranteed to be before last instruction, - * and last instruction guaranteed to be a RET, we dont need to check - * flen. (We used to pass to this function the length of filter) + * Decode and apply filter instructions to the skb->data. Return length to + * keep, 0 for none. @ctx is the data we are operating on, @filter is the + * array of filter instructions. 
*/ -unsigned int sk_run_filter(const struct sk_buff *skb, - const struct sock_filter *fentry) +unsigned int __sk_run_filter(void *ctx, const struct sock_filter_int *insn) { + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG], tmp; void *ptr; - u32 A = 0; /* Accumulator */ - u32 X = 0; /* Index Register */ - u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ - u32 tmp; - int k; + int off; + +#define K insn->imm +#define A regs[insn->a_reg] +#define X regs[insn->x_reg] +#define R0 regs[0] + +#define CONT ({insn++; goto select_insn; }) +#define CONT_JMP ({insn++; goto select_insn; }) + + static const void *jumptable[256] = { + [0 ... 255] = &&default_label, + /* Now overwrite non-defaults ... */ +#define DL(A, B, C) [A|B|C] = &&A##_##B##_##C + DL(BPF_ALU, BPF_ADD, BPF_X), + DL(BPF_ALU, BPF_ADD, BPF_K), + DL(BPF_ALU, BPF_SUB, BPF_X), + DL(BPF_ALU, BPF_SUB, BPF_K), + DL(BPF_ALU, BPF_AND, BPF_X), + DL(BPF_ALU, BPF_AND, BPF_K), + DL(BPF_ALU, BPF_OR, BPF_X), + DL(BPF_ALU, BPF_OR, BPF_K), + DL(BPF_ALU, BPF_LSH, BPF_X), + DL(BPF_ALU, BPF_LSH, BPF_K), + DL(BPF_ALU, BPF_RSH, BPF_X), + DL(BPF_ALU, BPF_RSH, BPF_K), + DL(BPF_ALU, BPF_XOR, BPF_X), + DL(BPF_ALU, BPF_XOR, BPF_K), + DL(BPF_ALU, BPF_MUL, BPF_X), + DL(BPF_ALU, BPF_MUL, BPF_K), + DL(BPF_ALU, BPF_MOV, BPF_X), + DL(BPF_ALU, BPF_MOV, BPF_K), + DL(BPF_ALU, BPF_DIV, BPF_X), + DL(BPF_ALU, BPF_DIV, BPF_K), + DL(BPF_ALU, BPF_MOD, BPF_X), + DL(BPF_ALU, BPF_MOD, BPF_K), + DL(BPF_ALU, BPF_NEG, 0), + DL(BPF_ALU, BPF_END, BPF_TO_BE), + DL(BPF_ALU, BPF_END, BPF_TO_LE), + DL(BPF_ALU64, BPF_ADD, BPF_X), + DL(BPF_ALU64, BPF_ADD, BPF_K), + DL(BPF_ALU64, BPF_SUB, BPF_X), + DL(BPF_ALU64, BPF_SUB, BPF_K), + DL(BPF_ALU64, BPF_AND, BPF_X), + DL(BPF_ALU64, BPF_AND, BPF_K), + DL(BPF_ALU64, BPF_OR, BPF_X), + DL(BPF_ALU64, BPF_OR, BPF_K), + DL(BPF_ALU64, BPF_LSH, BPF_X), + DL(BPF_ALU64, BPF_LSH, BPF_K), + DL(BPF_ALU64, BPF_RSH, BPF_X), + DL(BPF_ALU64, BPF_RSH, BPF_K), + DL(BPF_ALU64, BPF_XOR, BPF_X), + DL(BPF_ALU64, BPF_XOR, BPF_K), + DL(BPF_ALU64, BPF_MUL, BPF_X), + DL(BPF_ALU64, BPF_MUL, BPF_K), + DL(BPF_ALU64, BPF_MOV, BPF_X), + DL(BPF_ALU64, BPF_MOV, BPF_K), + DL(BPF_ALU64, BPF_ARSH, BPF_X), + DL(BPF_ALU64, BPF_ARSH, BPF_K), + DL(BPF_ALU64, BPF_DIV, BPF_X), + DL(BPF_ALU64, BPF_DIV, BPF_K), + DL(BPF_ALU64, BPF_MOD, BPF_X), + DL(BPF_ALU64, BPF_MOD, BPF_K), + DL(BPF_ALU64, BPF_NEG, 0), + DL(BPF_JMP, BPF_CALL, 0), + DL(BPF_JMP, BPF_JA, 0), + DL(BPF_JMP, BPF_JEQ, BPF_X), + DL(BPF_JMP, BPF_JEQ, BPF_K), + DL(BPF_JMP, BPF_JNE, BPF_X), + DL(BPF_JMP, BPF_JNE, BPF_K), + DL(BPF_JMP, BPF_JGT, BPF_X), + DL(BPF_JMP, BPF_JGT, BPF_K), + DL(BPF_JMP, BPF_JGE, BPF_X), + DL(BPF_JMP, BPF_JGE, BPF_K), + DL(BPF_JMP, BPF_JSGT, BPF_X), + DL(BPF_JMP, BPF_JSGT, BPF_K), + DL(BPF_JMP, BPF_JSGE, BPF_X), + DL(BPF_JMP, BPF_JSGE, BPF_K), + DL(BPF_JMP, BPF_JSET, BPF_X), + DL(BPF_JMP, BPF_JSET, BPF_K), + DL(BPF_JMP, BPF_EXIT, 0), + DL(BPF_STX, BPF_MEM, BPF_B), + DL(BPF_STX, BPF_MEM, BPF_H), + DL(BPF_STX, BPF_MEM, BPF_W), + DL(BPF_STX, BPF_MEM, BPF_DW), + DL(BPF_STX, BPF_XADD, BPF_W), + DL(BPF_STX, BPF_XADD, BPF_DW), + DL(BPF_ST, BPF_MEM, BPF_B), + DL(BPF_ST, BPF_MEM, BPF_H), + DL(BPF_ST, BPF_MEM, BPF_W), + DL(BPF_ST, BPF_MEM, BPF_DW), + DL(BPF_LDX, BPF_MEM, BPF_B), + DL(BPF_LDX, BPF_MEM, BPF_H), + DL(BPF_LDX, BPF_MEM, BPF_W), + DL(BPF_LDX, BPF_MEM, BPF_DW), + DL(BPF_LD, BPF_ABS, BPF_W), + DL(BPF_LD, BPF_ABS, BPF_H), + DL(BPF_LD, BPF_ABS, BPF_B), + DL(BPF_LD, BPF_IND, BPF_W), + DL(BPF_LD, BPF_IND, BPF_H), + DL(BPF_LD, BPF_IND, BPF_B), +#undef DL + }; - /* - * Process array of filter 
instructions. - */ - for (;; fentry++) { -#if defined(CONFIG_X86_32) -#define K (fentry->k) -#else - const u32 K = fentry->k; -#endif - - switch (fentry->code) { - case BPF_S_ALU_ADD_X: - A += X; - continue; - case BPF_S_ALU_ADD_K: - A += K; - continue; - case BPF_S_ALU_SUB_X: - A -= X; - continue; - case BPF_S_ALU_SUB_K: - A -= K; - continue; - case BPF_S_ALU_MUL_X: - A *= X; - continue; - case BPF_S_ALU_MUL_K: - A *= K; - continue; - case BPF_S_ALU_DIV_X: - if (X == 0) - return 0; - A /= X; - continue; - case BPF_S_ALU_DIV_K: - A /= K; - continue; - case BPF_S_ALU_MOD_X: - if (X == 0) - return 0; - A %= X; - continue; - case BPF_S_ALU_MOD_K: - A %= K; - continue; - case BPF_S_ALU_AND_X: - A &= X; - continue; - case BPF_S_ALU_AND_K: - A &= K; - continue; - case BPF_S_ALU_OR_X: - A |= X; - continue; - case BPF_S_ALU_OR_K: - A |= K; - continue; - case BPF_S_ANC_ALU_XOR_X: - case BPF_S_ALU_XOR_X: - A ^= X; - continue; - case BPF_S_ALU_XOR_K: - A ^= K; - continue; - case BPF_S_ALU_LSH_X: - A <<= X; - continue; - case BPF_S_ALU_LSH_K: - A <<= K; - continue; - case BPF_S_ALU_RSH_X: - A >>= X; - continue; - case BPF_S_ALU_RSH_K: - A >>= K; - continue; - case BPF_S_ALU_NEG: - A = -A; - continue; - case BPF_S_JMP_JA: - fentry += K; - continue; - case BPF_S_JMP_JGT_K: - fentry += (A > K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGE_K: - fentry += (A >= K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JEQ_K: - fentry += (A == K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JSET_K: - fentry += (A & K) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGT_X: - fentry += (A > X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JGE_X: - fentry += (A >= X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JEQ_X: - fentry += (A == X) ? fentry->jt : fentry->jf; - continue; - case BPF_S_JMP_JSET_X: - fentry += (A & X) ? 
fentry->jt : fentry->jf; - continue; - case BPF_S_LD_W_ABS: - k = K; -load_w: - ptr = load_pointer(skb, k, 4, &tmp); - if (ptr != NULL) { - A = get_unaligned_be32(ptr); - continue; - } - return 0; - case BPF_S_LD_H_ABS: - k = K; -load_h: - ptr = load_pointer(skb, k, 2, &tmp); - if (ptr != NULL) { - A = get_unaligned_be16(ptr); - continue; + regs[FP_REG] = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + regs[ARG1_REG] = (u64) (unsigned long) ctx; + +select_insn: + goto *jumptable[insn->code]; + + /* ALU */ +#define ALU(OPCODE, OP) \ + BPF_ALU64_##OPCODE##_BPF_X: \ + A = A OP X; \ + CONT; \ + BPF_ALU_##OPCODE##_BPF_X: \ + A = (u32) A OP (u32) X; \ + CONT; \ + BPF_ALU64_##OPCODE##_BPF_K: \ + A = A OP K; \ + CONT; \ + BPF_ALU_##OPCODE##_BPF_K: \ + A = (u32) A OP (u32) K; \ + CONT; + + ALU(BPF_ADD, +) + ALU(BPF_SUB, -) + ALU(BPF_AND, &) + ALU(BPF_OR, |) + ALU(BPF_LSH, <<) + ALU(BPF_RSH, >>) + ALU(BPF_XOR, ^) + ALU(BPF_MUL, *) +#undef ALU + BPF_ALU_BPF_NEG_0: + A = (u32) -A; + CONT; + BPF_ALU64_BPF_NEG_0: + A = -A; + CONT; + BPF_ALU_BPF_MOV_BPF_X: + A = (u32) X; + CONT; + BPF_ALU_BPF_MOV_BPF_K: + A = (u32) K; + CONT; + BPF_ALU64_BPF_MOV_BPF_X: + A = X; + CONT; + BPF_ALU64_BPF_MOV_BPF_K: + A = K; + CONT; + BPF_ALU64_BPF_ARSH_BPF_X: + (*(s64 *) &A) >>= X; + CONT; + BPF_ALU64_BPF_ARSH_BPF_K: + (*(s64 *) &A) >>= K; + CONT; + BPF_ALU64_BPF_MOD_BPF_X: + tmp = A; + if (X) + A = do_div(tmp, X); + CONT; + BPF_ALU_BPF_MOD_BPF_X: + tmp = (u32) A; + if (X) + A = do_div(tmp, (u32) X); + CONT; + BPF_ALU64_BPF_MOD_BPF_K: + tmp = A; + if (K) + A = do_div(tmp, K); + CONT; + BPF_ALU_BPF_MOD_BPF_K: + tmp = (u32) A; + if (K) + A = do_div(tmp, (u32) K); + CONT; + BPF_ALU64_BPF_DIV_BPF_X: + if (X) + do_div(A, X); + CONT; + BPF_ALU_BPF_DIV_BPF_X: + tmp = (u32) A; + if (X) + do_div(tmp, (u32) X); + A = (u32) tmp; + CONT; + BPF_ALU64_BPF_DIV_BPF_K: + if (K) + do_div(A, K); + CONT; + BPF_ALU_BPF_DIV_BPF_K: + tmp = (u32) A; + if (K) + do_div(tmp, (u32) K); + A = (u32) tmp; + CONT; + BPF_ALU_BPF_END_BPF_TO_BE: + switch (K) { + case 16: + A = (__force u16) cpu_to_be16(A); + break; + case 32: + A = (__force u32) cpu_to_be32(A); + break; + case 64: + A = (__force u64) cpu_to_be64(A); + break; + } + CONT; + BPF_ALU_BPF_END_BPF_TO_LE: + switch (K) { + case 16: + A = (__force u16) cpu_to_le16(A); + break; + case 32: + A = (__force u32) cpu_to_le32(A); + break; + case 64: + A = (__force u64) cpu_to_le64(A); + break; + } + CONT; + + /* CALL */ + BPF_JMP_BPF_CALL_0: + /* Function call scratches R1-R5 registers, preserves R6-R9, + * and stores return value into R0. 
+ */ + R0 = (__bpf_call_base + insn->imm)(regs[1], regs[2], regs[3], + regs[4], regs[5]); + CONT; + + /* JMP */ + BPF_JMP_BPF_JA_0: + insn += insn->off; + CONT; + BPF_JMP_BPF_JEQ_BPF_X: + if (A == X) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JEQ_BPF_K: + if (A == K) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JNE_BPF_X: + if (A != X) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JNE_BPF_K: + if (A != K) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JGT_BPF_X: + if (A > X) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JGT_BPF_K: + if (A > K) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JGE_BPF_X: + if (A >= X) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JGE_BPF_K: + if (A >= K) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JSGT_BPF_X: + if (((s64)A) > ((s64)X)) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JSGT_BPF_K: + if (((s64)A) > ((s64)K)) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JSGE_BPF_X: + if (((s64)A) >= ((s64)X)) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JSGE_BPF_K: + if (((s64)A) >= ((s64)K)) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JSET_BPF_X: + if (A & X) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_JSET_BPF_K: + if (A & K) { + insn += insn->off; + CONT_JMP; + } + CONT; + BPF_JMP_BPF_EXIT_0: + return R0; + + /* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE) \ + BPF_STX_BPF_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (A + insn->off) = X; \ + CONT; \ + BPF_ST_BPF_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (A + insn->off) = K; \ + CONT; \ + BPF_LDX_BPF_MEM_##SIZEOP: \ + A = *(SIZE *)(unsigned long) (X + insn->off); \ + CONT; + + LDST(BPF_B, u8) + LDST(BPF_H, u16) + LDST(BPF_W, u32) + LDST(BPF_DW, u64) +#undef LDST + BPF_STX_BPF_XADD_BPF_W: /* lock xadd *(u32 *)(A + insn->off) += X */ + atomic_add((u32) X, (atomic_t *)(unsigned long) + (A + insn->off)); + CONT; + BPF_STX_BPF_XADD_BPF_DW: /* lock xadd *(u64 *)(A + insn->off) += X */ + atomic64_add((u64) X, (atomic64_t *)(unsigned long) + (A + insn->off)); + CONT; + BPF_LD_BPF_ABS_BPF_W: /* R0 = ntohl(*(u32 *) (skb->data + K)) */ + off = K; +load_word: + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are only + * appearing in the programs where ctx == skb. All programs + * keep 'ctx' in regs[CTX_REG] == R6, sk_convert_filter() + * saves it in R6, internal BPF verifier will check that + * R6 == ctx. + * + * BPF_ABS and BPF_IND are wrappers of function calls, so + * they scratch R1-R5 registers, preserve R6-R9, and store + * return value into R0. 
+ * + * Implicit input: + * ctx + * + * Explicit input: + * X == any register + * K == 32-bit immediate + * + * Output: + * R0 - 8/16/32-bit skb data converted to cpu endianness + */ + ptr = load_pointer((struct sk_buff *) ctx, off, 4, &tmp); + if (likely(ptr != NULL)) { + R0 = get_unaligned_be32(ptr); + CONT; + } + return 0; + BPF_LD_BPF_ABS_BPF_H: /* R0 = ntohs(*(u16 *) (skb->data + K)) */ + off = K; +load_half: + ptr = load_pointer((struct sk_buff *) ctx, off, 2, &tmp); + if (likely(ptr != NULL)) { + R0 = get_unaligned_be16(ptr); + CONT; + } + return 0; + BPF_LD_BPF_ABS_BPF_B: /* R0 = *(u8 *) (ctx + K) */ + off = K; +load_byte: + ptr = load_pointer((struct sk_buff *) ctx, off, 1, &tmp); + if (likely(ptr != NULL)) { + R0 = *(u8 *)ptr; + CONT; + } + return 0; + BPF_LD_BPF_IND_BPF_W: /* R0 = ntohl(*(u32 *) (skb->data + X + K)) */ + off = K + X; + goto load_word; + BPF_LD_BPF_IND_BPF_H: /* R0 = ntohs(*(u16 *) (skb->data + X + K)) */ + off = K + X; + goto load_half; + BPF_LD_BPF_IND_BPF_B: /* R0 = *(u8 *) (skb->data + X + K) */ + off = K + X; + goto load_byte; + + default_label: + /* If we ever reach this, we have a bug somewhere. */ + WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + return 0; +#undef CONT_JMP +#undef CONT + +#undef R0 +#undef X +#undef A +#undef K +} + +u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx, + const struct sock_filter_int *insni) + __attribute__ ((alias ("__sk_run_filter"))); + +u32 sk_run_filter_int_skb(const struct sk_buff *ctx, + const struct sock_filter_int *insni) + __attribute__ ((alias ("__sk_run_filter"))); +EXPORT_SYMBOL_GPL(sk_run_filter_int_skb); + +/* Helper to find the offset of pkt_type in sk_buff structure. We want + * to make sure its still a 3bit field starting at a byte boundary; + * taken from arch/x86/net/bpf_jit_comp.c. + */ +#define PKT_TYPE_MAX 7 +static unsigned int pkt_type_offset(void) +{ + struct sk_buff skb_probe = { .pkt_type = ~0, }; + u8 *ct = (u8 *) &skb_probe; + unsigned int off; + + for (off = 0; off < sizeof(struct sk_buff); off++) { + if (ct[off] == PKT_TYPE_MAX) + return off; + } + + pr_err_once("Please fix %s, as pkt_type couldn't be found!\n", __func__); + return -1; +} + +static u64 __skb_get_pay_offset(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long) ctx; + + return __skb_get_poff(skb); +} + +static u64 __skb_get_nlattr(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long) ctx; + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (A > skb->len - sizeof(struct nlattr)) + return 0; + + nla = nla_find((struct nlattr *) &skb->data[A], skb->len - A, X); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +static u64 __skb_get_nlattr_nest(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *)(long) ctx; + struct nlattr *nla; + + if (skb_is_nonlinear(skb)) + return 0; + + if (A > skb->len - sizeof(struct nlattr)) + return 0; + + nla = (struct nlattr *) &skb->data[A]; + if (nla->nla_len > A - skb->len) + return 0; + + nla = nla_find_nested(nla, X); + if (nla) + return (void *) nla - (void *) skb->data; + + return 0; +} + +static u64 __get_raw_cpu_id(u64 ctx, u64 A, u64 X, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +/* Register mappings for user programs. 
*/ +#define A_REG 0 +#define X_REG 7 +#define TMP_REG 8 +#define ARG2_REG 2 +#define ARG3_REG 3 + +static bool convert_bpf_extensions(struct sock_filter *fp, + struct sock_filter_int **insnp) +{ + struct sock_filter_int *insn = *insnp; + + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PROTOCOL: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + + insn->code = BPF_LDX | BPF_MEM | BPF_H; + insn->a_reg = A_REG; + insn->x_reg = CTX_REG; + insn->off = offsetof(struct sk_buff, protocol); + insn++; + + /* A = ntohs(A) [emitting a nop or swap16] */ + insn->code = BPF_ALU | BPF_END | BPF_FROM_BE; + insn->a_reg = A_REG; + insn->imm = 16; + break; + + case SKF_AD_OFF + SKF_AD_PKTTYPE: + insn->code = BPF_LDX | BPF_MEM | BPF_B; + insn->a_reg = A_REG; + insn->x_reg = CTX_REG; + insn->off = pkt_type_offset(); + if (insn->off < 0) + return false; + insn++; + + insn->code = BPF_ALU | BPF_AND | BPF_K; + insn->a_reg = A_REG; + insn->imm = PKT_TYPE_MAX; + break; + + case SKF_AD_OFF + SKF_AD_IFINDEX: + case SKF_AD_OFF + SKF_AD_HATYPE: + if (FIELD_SIZEOF(struct sk_buff, dev) == 8) + insn->code = BPF_LDX | BPF_MEM | BPF_DW; + else + insn->code = BPF_LDX | BPF_MEM | BPF_W; + insn->a_reg = TMP_REG; + insn->x_reg = CTX_REG; + insn->off = offsetof(struct sk_buff, dev); + insn++; + + insn->code = BPF_JMP | BPF_JNE | BPF_K; + insn->a_reg = TMP_REG; + insn->imm = 0; + insn->off = 1; + insn++; + + insn->code = BPF_JMP | BPF_EXIT; + insn++; + + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); + + insn->a_reg = A_REG; + insn->x_reg = TMP_REG; + + if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX) { + insn->code = BPF_LDX | BPF_MEM | BPF_W; + insn->off = offsetof(struct net_device, ifindex); + } else { + insn->code = BPF_LDX | BPF_MEM | BPF_H; + insn->off = offsetof(struct net_device, type); + } + break; + + case SKF_AD_OFF + SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + insn->code = BPF_LDX | BPF_MEM | BPF_W; + insn->a_reg = A_REG; + insn->x_reg = CTX_REG; + insn->off = offsetof(struct sk_buff, mark); + break; + + case SKF_AD_OFF + SKF_AD_RXHASH: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + + insn->code = BPF_LDX | BPF_MEM | BPF_W; + insn->a_reg = A_REG; + insn->x_reg = CTX_REG; + insn->off = offsetof(struct sk_buff, hash); + break; + + case SKF_AD_OFF + SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + insn->code = BPF_LDX | BPF_MEM | BPF_H; + insn->a_reg = A_REG; + insn->x_reg = CTX_REG; + insn->off = offsetof(struct sk_buff, queue_mapping); + break; + + case SKF_AD_OFF + SKF_AD_VLAN_TAG: + case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); + + insn->code = BPF_LDX | BPF_MEM | BPF_H; + insn->a_reg = A_REG; + insn->x_reg = CTX_REG; + insn->off = offsetof(struct sk_buff, vlan_tci); + insn++; + + BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); + + if (fp->k == SKF_AD_OFF + SKF_AD_VLAN_TAG) { + insn->code = BPF_ALU | BPF_AND | BPF_K; + insn->a_reg = A_REG; + insn->imm = ~VLAN_TAG_PRESENT; + } else { + insn->code = BPF_ALU | BPF_RSH | BPF_K; + insn->a_reg = A_REG; + insn->imm = 12; + insn++; + + insn->code = BPF_ALU | BPF_AND | BPF_K; + insn->a_reg = A_REG; + insn->imm = 1; + } + break; + + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + case SKF_AD_OFF + SKF_AD_NLATTR: + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + case SKF_AD_OFF + SKF_AD_CPU: + /* arg1 = ctx */ + insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + insn->a_reg = ARG1_REG; + insn->x_reg 
= CTX_REG; + insn++; + + /* arg2 = A */ + insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + insn->a_reg = ARG2_REG; + insn->x_reg = A_REG; + insn++; + + /* arg3 = X */ + insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + insn->a_reg = ARG3_REG; + insn->x_reg = X_REG; + insn++; + + /* Emit call(ctx, arg2=A, arg3=X) */ + insn->code = BPF_JMP | BPF_CALL; + switch (fp->k) { + case SKF_AD_OFF + SKF_AD_PAY_OFFSET: + insn->imm = __skb_get_pay_offset - __bpf_call_base; + break; + case SKF_AD_OFF + SKF_AD_NLATTR: + insn->imm = __skb_get_nlattr - __bpf_call_base; + break; + case SKF_AD_OFF + SKF_AD_NLATTR_NEST: + insn->imm = __skb_get_nlattr_nest - __bpf_call_base; + break; + case SKF_AD_OFF + SKF_AD_CPU: + insn->imm = __get_raw_cpu_id - __bpf_call_base; + break; + } + break; + + case SKF_AD_OFF + SKF_AD_ALU_XOR_X: + insn->code = BPF_ALU | BPF_XOR | BPF_X; + insn->a_reg = A_REG; + insn->x_reg = X_REG; + break; + + default: + /* This is just a dummy call to avoid letting the compiler + * evict __bpf_call_base() as an optimization. Placed here + * where no-one bothers. + */ + BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0); + return false; + } + + *insnp = insn; + return true; +} + +/** + * sk_convert_filter - convert filter program + * @prog: the user passed filter program + * @len: the length of the user passed filter program + * @new_prog: buffer where converted program will be stored + * @new_len: pointer to store length of converted program + * + * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. + * Conversion workflow: + * + * 1) First pass for calculating the new program length: + * sk_convert_filter(old_prog, old_len, NULL, &new_len) + * + * 2) 2nd pass to remap in two passes: 1st pass finds new + * jump offsets, 2nd pass remapping: + * new_prog = kmalloc(sizeof(struct sock_filter_int) * new_len); + * sk_convert_filter(old_prog, old_len, new_prog, &new_len); + * + * User BPF's register A is mapped to our BPF register 6, user BPF + * register X is mapped to BPF register 7; frame pointer is always + * register 10; Context 'void *ctx' is stored in register 1, that is, + * for socket filters: ctx == 'struct sk_buff *', for seccomp: + * ctx == 'struct seccomp_data *'. + */ +int sk_convert_filter(struct sock_filter *prog, int len, + struct sock_filter_int *new_prog, int *new_len) +{ + int new_flen = 0, pass = 0, target, i; + struct sock_filter_int *new_insn; + struct sock_filter *fp; + int *addrs = NULL; + u8 bpf_src; + + BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK); + BUILD_BUG_ON(FP_REG + 1 != MAX_BPF_REG); + + if (len <= 0 || len >= BPF_MAXINSNS) + return -EINVAL; + + if (new_prog) { + addrs = kzalloc(len * sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + } + +do_pass: + new_insn = new_prog; + fp = prog; + + if (new_insn) { + new_insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + new_insn->a_reg = CTX_REG; + new_insn->x_reg = ARG1_REG; + } + new_insn++; + + for (i = 0; i < len; fp++, i++) { + struct sock_filter_int tmp_insns[6] = { }; + struct sock_filter_int *insn = tmp_insns; + + if (addrs) + addrs[i] = new_insn - new_prog; + + switch (fp->code) { + /* All arithmetic insns and skb loads map as-is. 
*/ + case BPF_ALU | BPF_ADD | BPF_X: + case BPF_ALU | BPF_ADD | BPF_K: + case BPF_ALU | BPF_SUB | BPF_X: + case BPF_ALU | BPF_SUB | BPF_K: + case BPF_ALU | BPF_AND | BPF_X: + case BPF_ALU | BPF_AND | BPF_K: + case BPF_ALU | BPF_OR | BPF_X: + case BPF_ALU | BPF_OR | BPF_K: + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU | BPF_LSH | BPF_K: + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU | BPF_RSH | BPF_K: + case BPF_ALU | BPF_XOR | BPF_X: + case BPF_ALU | BPF_XOR | BPF_K: + case BPF_ALU | BPF_MUL | BPF_X: + case BPF_ALU | BPF_MUL | BPF_K: + case BPF_ALU | BPF_DIV | BPF_X: + case BPF_ALU | BPF_DIV | BPF_K: + case BPF_ALU | BPF_MOD | BPF_X: + case BPF_ALU | BPF_MOD | BPF_K: + case BPF_ALU | BPF_NEG: + case BPF_LD | BPF_ABS | BPF_W: + case BPF_LD | BPF_ABS | BPF_H: + case BPF_LD | BPF_ABS | BPF_B: + case BPF_LD | BPF_IND | BPF_W: + case BPF_LD | BPF_IND | BPF_H: + case BPF_LD | BPF_IND | BPF_B: + /* Check for overloaded BPF extension and + * directly convert it if found, otherwise + * just move on with mapping. + */ + if (BPF_CLASS(fp->code) == BPF_LD && + BPF_MODE(fp->code) == BPF_ABS && + convert_bpf_extensions(fp, &insn)) + break; + + insn->code = fp->code; + insn->a_reg = A_REG; + insn->x_reg = X_REG; + insn->imm = fp->k; + break; + + /* Jump opcodes map as-is, but offsets need adjustment. */ + case BPF_JMP | BPF_JA: + target = i + fp->k + 1; + insn->code = fp->code; +#define EMIT_JMP \ + do { \ + if (target >= len || target < 0) \ + goto err; \ + insn->off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ + /* Adjust pc relative offset for 2nd or 3rd insn. */ \ + insn->off -= insn - tmp_insns; \ + } while (0) + + EMIT_JMP; + break; + + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JEQ | BPF_X: + case BPF_JMP | BPF_JSET | BPF_K: + case BPF_JMP | BPF_JSET | BPF_X: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGE | BPF_X: + if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) { + /* BPF immediates are signed, zero extend + * immediate into tmp register and use it + * in compare insn. + */ + insn->code = BPF_ALU | BPF_MOV | BPF_K; + insn->a_reg = TMP_REG; + insn->imm = fp->k; + insn++; + + insn->a_reg = A_REG; + insn->x_reg = TMP_REG; + bpf_src = BPF_X; + } else { + insn->a_reg = A_REG; + insn->x_reg = X_REG; + insn->imm = fp->k; + bpf_src = BPF_SRC(fp->code); } - return 0; - case BPF_S_LD_B_ABS: - k = K; -load_b: - ptr = load_pointer(skb, k, 1, &tmp); - if (ptr != NULL) { - A = *(u8 *)ptr; - continue; + + /* Common case where 'jump_false' is next insn. */ + if (fp->jf == 0) { + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + target = i + fp->jt + 1; + EMIT_JMP; + break; } - return 0; - case BPF_S_LD_W_LEN: - A = skb->len; - continue; - case BPF_S_LDX_W_LEN: - X = skb->len; - continue; - case BPF_S_LD_W_IND: - k = X + K; - goto load_w; - case BPF_S_LD_H_IND: - k = X + K; - goto load_h; - case BPF_S_LD_B_IND: - k = X + K; - goto load_b; - case BPF_S_LDX_B_MSH: - ptr = load_pointer(skb, K, 1, &tmp); - if (ptr != NULL) { - X = (*(u8 *)ptr & 0xf) << 2; - continue; + + /* Convert JEQ into JNE when 'jump_true' is next insn. 
*/ + if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { + insn->code = BPF_JMP | BPF_JNE | bpf_src; + target = i + fp->jf + 1; + EMIT_JMP; + break; } - return 0; - case BPF_S_LD_IMM: - A = K; - continue; - case BPF_S_LDX_IMM: - X = K; - continue; - case BPF_S_LD_MEM: - A = mem[K]; - continue; - case BPF_S_LDX_MEM: - X = mem[K]; - continue; - case BPF_S_MISC_TAX: - X = A; - continue; - case BPF_S_MISC_TXA: - A = X; - continue; - case BPF_S_RET_K: - return K; - case BPF_S_RET_A: - return A; - case BPF_S_ST: - mem[K] = A; - continue; - case BPF_S_STX: - mem[K] = X; - continue; - case BPF_S_ANC_PROTOCOL: - A = ntohs(skb->protocol); - continue; - case BPF_S_ANC_PKTTYPE: - A = skb->pkt_type; - continue; - case BPF_S_ANC_IFINDEX: - if (!skb->dev) - return 0; - A = skb->dev->ifindex; - continue; - case BPF_S_ANC_MARK: - A = skb->mark; - continue; - case BPF_S_ANC_QUEUE: - A = skb->queue_mapping; - continue; - case BPF_S_ANC_HATYPE: - if (!skb->dev) - return 0; - A = skb->dev->type; - continue; - case BPF_S_ANC_RXHASH: - A = skb->hash; - continue; - case BPF_S_ANC_CPU: - A = raw_smp_processor_id(); - continue; - case BPF_S_ANC_VLAN_TAG: - A = vlan_tx_tag_get(skb); - continue; - case BPF_S_ANC_VLAN_TAG_PRESENT: - A = !!vlan_tx_tag_present(skb); - continue; - case BPF_S_ANC_PAY_OFFSET: - A = __skb_get_poff(skb); - continue; - case BPF_S_ANC_NLATTR: { - struct nlattr *nla; - - if (skb_is_nonlinear(skb)) - return 0; - if (A > skb->len - sizeof(struct nlattr)) - return 0; - - nla = nla_find((struct nlattr *)&skb->data[A], - skb->len - A, X); - if (nla) - A = (void *)nla - (void *)skb->data; - else - A = 0; - continue; - } - case BPF_S_ANC_NLATTR_NEST: { - struct nlattr *nla; - - if (skb_is_nonlinear(skb)) - return 0; - if (A > skb->len - sizeof(struct nlattr)) - return 0; - - nla = (struct nlattr *)&skb->data[A]; - if (nla->nla_len > A - skb->len) - return 0; - - nla = nla_find_nested(nla, X); - if (nla) - A = (void *)nla - (void *)skb->data; - else - A = 0; - continue; - } -#ifdef CONFIG_SECCOMP_FILTER - case BPF_S_ANC_SECCOMP_LD_W: - A = seccomp_bpf_load(fentry->k); - continue; -#endif + + /* Other jumps are mapped into two insns: Jxx and JA. */ + target = i + fp->jt + 1; + insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; + EMIT_JMP; + insn++; + + insn->code = BPF_JMP | BPF_JA; + target = i + fp->jf + 1; + EMIT_JMP; + break; + + /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ + case BPF_LDX | BPF_MSH | BPF_B: + insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + insn->a_reg = TMP_REG; + insn->x_reg = A_REG; + insn++; + + insn->code = BPF_LD | BPF_ABS | BPF_B; + insn->a_reg = A_REG; + insn->imm = fp->k; + insn++; + + insn->code = BPF_ALU | BPF_AND | BPF_K; + insn->a_reg = A_REG; + insn->imm = 0xf; + insn++; + + insn->code = BPF_ALU | BPF_LSH | BPF_K; + insn->a_reg = A_REG; + insn->imm = 2; + insn++; + + insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + insn->a_reg = X_REG; + insn->x_reg = A_REG; + insn++; + + insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + insn->a_reg = A_REG; + insn->x_reg = TMP_REG; + break; + + /* RET_K, RET_A are remaped into 2 insns. */ + case BPF_RET | BPF_A: + case BPF_RET | BPF_K: + insn->code = BPF_ALU | BPF_MOV | + (BPF_RVAL(fp->code) == BPF_K ? + BPF_K : BPF_X); + insn->a_reg = 0; + insn->x_reg = A_REG; + insn->imm = fp->k; + insn++; + + insn->code = BPF_JMP | BPF_EXIT; + break; + + /* Store to stack. */ + case BPF_ST: + case BPF_STX: + insn->code = BPF_STX | BPF_MEM | BPF_W; + insn->a_reg = FP_REG; + insn->x_reg = fp->code == BPF_ST ? 
A_REG : X_REG; + insn->off = -(BPF_MEMWORDS - fp->k) * 4; + break; + + /* Load from stack. */ + case BPF_LD | BPF_MEM: + case BPF_LDX | BPF_MEM: + insn->code = BPF_LDX | BPF_MEM | BPF_W; + insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? + A_REG : X_REG; + insn->x_reg = FP_REG; + insn->off = -(BPF_MEMWORDS - fp->k) * 4; + break; + + /* A = K or X = K */ + case BPF_LD | BPF_IMM: + case BPF_LDX | BPF_IMM: + insn->code = BPF_ALU | BPF_MOV | BPF_K; + insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? + A_REG : X_REG; + insn->imm = fp->k; + break; + + /* X = A */ + case BPF_MISC | BPF_TAX: + insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + insn->a_reg = X_REG; + insn->x_reg = A_REG; + break; + + /* A = X */ + case BPF_MISC | BPF_TXA: + insn->code = BPF_ALU64 | BPF_MOV | BPF_X; + insn->a_reg = A_REG; + insn->x_reg = X_REG; + break; + + /* A = skb->len or X = skb->len */ + case BPF_LD | BPF_W | BPF_LEN: + case BPF_LDX | BPF_W | BPF_LEN: + insn->code = BPF_LDX | BPF_MEM | BPF_W; + insn->a_reg = BPF_CLASS(fp->code) == BPF_LD ? + A_REG : X_REG; + insn->x_reg = CTX_REG; + insn->off = offsetof(struct sk_buff, len); + break; + + /* access seccomp_data fields */ + case BPF_LDX | BPF_ABS | BPF_W: + insn->code = BPF_LDX | BPF_MEM | BPF_W; + insn->a_reg = A_REG; + insn->x_reg = CTX_REG; + insn->off = fp->k; + break; + default: - WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", - fentry->code, fentry->jt, - fentry->jf, fentry->k); - return 0; + goto err; } + + insn++; + if (new_prog) + memcpy(new_insn, tmp_insns, + sizeof(*insn) * (insn - tmp_insns)); + + new_insn += insn - tmp_insns; } + if (!new_prog) { + /* Only calculating new length. */ + *new_len = new_insn - new_prog; + return 0; + } + + pass++; + if (new_flen != new_insn - new_prog) { + new_flen = new_insn - new_prog; + if (pass > 2) + goto err; + + goto do_pass; + } + + kfree(addrs); + BUG_ON(*new_len != new_flen); return 0; +err: + kfree(addrs); + return -EINVAL; } -EXPORT_SYMBOL(sk_run_filter); -/* - * Security : +/* Security: + * * A BPF program is able to use 16 cells of memory to store intermediate - * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()) + * values (check u32 mem[BPF_MEMWORDS] in sk_run_filter()). + * * As we dont want to clear mem[] array for each packet going through * sk_run_filter(), we check that filter loaded by user never try to read * a cell if not previously written, and we check all branches to be sure @@ -696,19 +1442,130 @@ void sk_filter_charge(struct sock *sk, struct sk_filter *fp) atomic_add(sk_filter_size(fp->len), &sk->sk_omem_alloc); } -static int __sk_prepare_filter(struct sk_filter *fp) +static struct sk_filter *__sk_migrate_realloc(struct sk_filter *fp, + struct sock *sk, + unsigned int len) +{ + struct sk_filter *fp_new; + + if (sk == NULL) + return krealloc(fp, len, GFP_KERNEL); + + fp_new = sock_kmalloc(sk, len, GFP_KERNEL); + if (fp_new) { + memcpy(fp_new, fp, sizeof(struct sk_filter)); + /* As we're kepping orig_prog in fp_new along, + * we need to make sure we're not evicting it + * from the old fp. + */ + fp->orig_prog = NULL; + sk_filter_uncharge(sk, fp); + } + + return fp_new; +} + +static struct sk_filter *__sk_migrate_filter(struct sk_filter *fp, + struct sock *sk) +{ + struct sock_filter *old_prog; + struct sk_filter *old_fp; + int i, err, new_len, old_len = fp->len; + + /* We are free to overwrite insns et al right here as it + * won't be used at this point in time anymore internally + * after the migration to the internal BPF instruction + * representation. 
+ */ + BUILD_BUG_ON(sizeof(struct sock_filter) != + sizeof(struct sock_filter_int)); + + /* For now, we need to unfiddle BPF_S_* identifiers in place. + * This can sooner or later on be subject to removal, e.g. when + * JITs have been converted. + */ + for (i = 0; i < fp->len; i++) + sk_decode_filter(&fp->insns[i], &fp->insns[i]); + + /* Conversion cannot happen on overlapping memory areas, + * so we need to keep the user BPF around until the 2nd + * pass. At this time, the user BPF is stored in fp->insns. + */ + old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter), + GFP_KERNEL); + if (!old_prog) { + err = -ENOMEM; + goto out_err; + } + + /* 1st pass: calculate the new program length. */ + err = sk_convert_filter(old_prog, old_len, NULL, &new_len); + if (err) + goto out_err_free; + + /* Expand fp for appending the new filter representation. */ + old_fp = fp; + fp = __sk_migrate_realloc(old_fp, sk, sk_filter_size(new_len)); + if (!fp) { + /* The old_fp is still around in case we couldn't + * allocate new memory, so uncharge on that one. + */ + fp = old_fp; + err = -ENOMEM; + goto out_err_free; + } + + fp->bpf_func = sk_run_filter_int_skb; + fp->len = new_len; + + /* 2nd pass: remap sock_filter insns into sock_filter_int insns. */ + err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len); + if (err) + /* 2nd sk_convert_filter() can fail only if it fails + * to allocate memory, remapping must succeed. Note, + * that at this time old_fp has already been released + * by __sk_migrate_realloc(). + */ + goto out_err_free; + + kfree(old_prog); + return fp; + +out_err_free: + kfree(old_prog); +out_err: + /* Rollback filter setup. */ + if (sk != NULL) + sk_filter_uncharge(sk, fp); + else + kfree(fp); + return ERR_PTR(err); +} + +static struct sk_filter *__sk_prepare_filter(struct sk_filter *fp, + struct sock *sk) { int err; - fp->bpf_func = sk_run_filter; + fp->bpf_func = NULL; fp->jited = 0; err = sk_chk_filter(fp->insns, fp->len); if (err) - return err; + return ERR_PTR(err); + /* Probe if we can JIT compile the filter and if so, do + * the compilation of the filter. + */ bpf_jit_compile(fp); - return 0; + + /* JIT compiler couldn't process this filter, so do the + * internal BPF translation for the optimized interpreter. + */ + if (!fp->jited) + fp = __sk_migrate_filter(fp, sk); + + return fp; } /** @@ -726,7 +1583,6 @@ int sk_unattached_filter_create(struct sk_filter **pfp, { unsigned int fsize = sk_filter_proglen(fprog); struct sk_filter *fp; - int err; /* Make sure new filter is there and in the right amounts. */ if (fprog->filter == NULL) @@ -746,15 +1602,15 @@ int sk_unattached_filter_create(struct sk_filter **pfp, */ fp->orig_prog = NULL; - err = __sk_prepare_filter(fp); - if (err) - goto free_mem; + /* __sk_prepare_filter() already takes care of uncharging + * memory in case something goes wrong. + */ + fp = __sk_prepare_filter(fp, NULL); + if (IS_ERR(fp)) + return PTR_ERR(fp); *pfp = fp; return 0; -free_mem: - kfree(fp); - return err; } EXPORT_SYMBOL_GPL(sk_unattached_filter_create); @@ -806,11 +1662,12 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return -ENOMEM; } - err = __sk_prepare_filter(fp); - if (err) { - sk_filter_uncharge(sk, fp); - return err; - } + /* __sk_prepare_filter() already takes care of uncharging + * memory in case something goes wrong. 
+ */ + fp = __sk_prepare_filter(fp, sk); + if (IS_ERR(fp)) + return PTR_ERR(fp); old_fp = rcu_dereference_protected(sk->sk_filter, sock_owned_by_user(sk)); -- cgit v1.3-14-g43fede From 543bc6a1a987672b79d6ebe8e2ab10471d8f1047 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Sun, 30 Mar 2014 19:07:54 -0400 Subject: AUDIT: Allow login in non-init namespaces It is possible to configure your PAM stack to refuse login if audit messages (about the login) were unable to be sent. This is common in many distros and thus normal configuration of many containers. The PAM modules determine if audit is enabled/disabled in the kernel based on the return value from sending an audit message on the netlink socket. If userspace gets back ECONNREFUSED it believes audit is disabled in the kernel. If it gets any other error, it refuses to let the login proceed. Just about ever since the introduction of namespaces the kernel audit subsystem has returned EPERM if the task sending a message was not in the init user or pid namespace. So many forms of containers have never worked if audit was enabled in the kernel. BUT if the container was not in net_init then the kernel network code would send ECONNREFUSED (instead of the audit code sending EPERM). Thus by pure accident/dumb luck/bug if an admin configured the PAM stack to reject all logins that didn't talk to audit, but then ran the login utility in the non-init_net namespace, it would work!! Clearly this was a bug, but it is a bug some people expected. With the introduction of network namespace support in 3.14-rc1 the two bugs stopped cancelling each other out. Now, containers in the non-init_net namespace refused to let users log in (just like PAM was configured!) Obviously some people were not happy that what used to let users log in, now didn't! This fix is kinda hacky. We return ECONNREFUSED for all non-init relevant namespaces. That means that not only will the old broken non-init_net setups continue to work, now the broken non-init_pid or non-init_user setups will 'work'. They don't really work, since audit isn't logging things. But it's what most users want. In 3.15 we should have patches to support not only the non-init_net (3.14) namespace but also the non-init_pid and non-init_user namespace. So all will be right in the world. This just opens the doors wide open on 3.14 and hopefully makes users happy, if not the audit system... Reported-by: Andre Tomt Reported-by: Adam Richter Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds Conflicts: kernel/audit.c --- kernel/audit.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ad77d1e80895..873b965fdc58 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -608,8 +608,18 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) int err = 0; /* Only support initial user namespace for now. */ + /* + * We return ECONNREFUSED because it tricks userspace into thinking + * that audit was not configured into the kernel. Lots of users + * configure their PAM stack (because that's what the distro does) + * to reject login if unable to send messages to audit. If we return + * ECONNREFUSED the PAM stack thinks the kernel does not have audit + * configured in and will let login proceed. If we return EPERM + * userspace will reject all logins. This should be removed when we + * support non init namespaces!!
+ */ if ((current_user_ns() != &init_user_ns)) - return -EPERM; + return -ECONNREFUSED; switch (msg_type) { case AUDIT_LIST: -- cgit v1.3-14-g43fede From fbb32750a62df75d1ffea547f3908b21c5496d9f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Feb 2014 21:09:54 -0500 Subject: pipe: kill ->map() and ->unmap() all pipe_buffer_operations have the same instances of those... Signed-off-by: Al Viro --- drivers/char/virtio_console.c | 4 +-- fs/fuse/dev.c | 6 ++-- fs/pipe.c | 70 ++++++++++--------------------------------- fs/splice.c | 24 +++++---------- include/linux/pipe_fs_i.h | 19 ------------ kernel/relay.c | 2 -- kernel/trace/trace.c | 4 --- 7 files changed, 29 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 6928d094451d..60aafb8a1f2e 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -901,9 +901,9 @@ static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf, if (len + offset > PAGE_SIZE) len = PAGE_SIZE - offset; - src = buf->ops->map(pipe, buf, 1); + src = kmap_atomic(buf->page); memcpy(page_address(page) + offset, src + buf->offset, len); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); sg_set_page(&(sgl->sg[sgl->n]), page, len, offset); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f90af6fe6884..aac71ce373e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -667,7 +667,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) struct pipe_buffer *buf = cs->currbuf; if (!cs->write) { - buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + kunmap_atomic(cs->mapaddr); } else { kunmap_atomic(cs->mapaddr); buf->len = PAGE_SIZE - cs->len; @@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = kmap_atomic(buf->page); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = kmap_atomic(buf->page); cs->buf = cs->mapaddr + buf->offset; err = lock_request(cs->fc, cs->req); diff --git a/fs/pipe.c b/fs/pipe.c index 78fd0d0788db..6679c95eb4c3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -225,52 +225,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, page_cache_release(page); } -/** - * generic_pipe_buf_map - virtually map a pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be mapped - * @atomic: whether to use an atomic map - * - * Description: - * This function returns a kernel virtual address mapping for the - * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided - * and the caller has to be careful not to fault before calling - * the unmap function. - * - * Note that this function calls kmap_atomic() if @atomic != 0. 
- */ -void *generic_pipe_buf_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, int atomic) -{ - if (atomic) { - buf->flags |= PIPE_BUF_FLAG_ATOMIC; - return kmap_atomic(buf->page); - } - - return kmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_map); - -/** - * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be unmapped - * @map_data: the data that the mapping function returned - * - * Description: - * This function undoes the mapping that ->map() provided. - */ -void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, void *map_data) -{ - if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { - buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; - kunmap_atomic(map_data); - } else - kunmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_unmap); - /** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to @@ -351,8 +305,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -361,8 +313,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -410,9 +360,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, atomic = !iov_fault_in_pages_write(iov, chars); redo: - addr = ops->map(pipe, buf, atomic); + if (atomic) + addr = kmap_atomic(buf->page); + else + addr = kmap(buf->page); error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); - ops->unmap(pipe, buf, addr); + if (atomic) + kunmap_atomic(addr); + else + kunmap(buf->page); if (unlikely(error)) { /* * Just retry with the slow path if we failed. 
@@ -538,10 +494,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, iov_fault_in_pages_read(iov, chars); redo1: - addr = ops->map(pipe, buf, atomic); + if (atomic) + addr = kmap_atomic(buf->page); + else + addr = kmap(buf->page); error = pipe_iov_copy_from_user(offset + addr, iov, chars, atomic); - ops->unmap(pipe, buf, addr); + if (atomic) + kunmap_atomic(addr); + else + kunmap(buf->page); ret = error; do_wakeup = 1; if (error) { diff --git a/fs/splice.c b/fs/splice.c index 12028fa41def..ca3bfbd970f3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,8 +136,6 @@ error: const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read); static const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket and similar. */ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, @@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, goto out; if (buf->page != page) { - char *src = buf->ops->map(pipe, buf, 1); + char *src = kmap_atomic(buf->page); char *dst = kmap_atomic(page); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); } ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, page, fsdata); @@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *data; loff_t tmp = sd->pos; - data = buf->ops->map(pipe, buf, 0); + data = kmap(buf->page); ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); - buf->ops->unmap(pipe, buf, data); + kunmap(buf->page); return ret; } @@ -1536,10 +1528,10 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, * pages and doing an atomic copy */ if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { - src = buf->ops->map(pipe, buf, 1); + src = kmap_atomic(buf->page); ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, sd->len); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); if (!ret) { ret = sd->len; goto out; @@ -1549,13 +1541,13 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, /* * No dice, use slow non-atomic map and copy */ - src = buf->ops->map(pipe, buf, 0); + src = kmap(buf->page); ret = sd->len; if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) ret = -EFAULT; - buf->ops->unmap(pipe, buf, src); + kunmap(buf->page); out: if (ret > 0) sd->u.userptr += 
ret; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index ab5752692113..6dffcebe6105 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -82,23 +82,6 @@ struct pipe_buf_operations { */ int can_merge; - /* - * ->map() returns a virtual address mapping of the pipe buffer. - * The last integer flag reflects whether this should be an atomic - * mapping or not. The atomic map is faster, however you can't take - * page faults before calling ->unmap() again. So if you need to eg - * access user data through copy_to/from_user(), then you must get - * a non-atomic map. ->map() uses the kmap_atomic slot for - * atomic maps, you have to be careful if mapping another page as - * source or destination for a copy. - */ - void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); - - /* - * Undoes ->map(), finishes the virtual mapping of the pipe buffer. - */ - void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); - /* * ->confirm() verifies that the data in the pipe buffer is there * and that the contents are good. If the pages in the pipe belong @@ -150,8 +133,6 @@ struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); -void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); diff --git a/kernel/relay.c b/kernel/relay.c index 5001c9887db1..98833f664fb6 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, static const struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 24c1f2382557..7511de35257f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4316,8 +4316,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -5194,8 +5192,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .steal = generic_pipe_buf_steal, -- cgit v1.3-14-g43fede From 56c4911aedbecc2bdf7940073e85d52b691e2509 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 2 Apr 2014 15:46:42 -0400 Subject: audit: do not cast audit_rule_data pointers pointlessly For some sort of legacy support, audit_rule is a subset of (and the first entry in) audit_rule_data. We don't actually need or use audit_rule. We just do a cast from one to the other for no gain whatsoever. Stop the crazy casting.
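(Editor's note: the prefix relationship the message relies on, sketched from the uapi definitions in include/uapi/linux/audit.h. The field lists here are recalled from that header and abridged, so treat the exact members as illustrative rather than authoritative.)

	struct audit_rule {			/* legacy AUDIT_LIST/ADD/DEL payload */
		__u32	flags;
		__u32	action;
		__u32	field_count;
		__u32	mask[AUDIT_BITMASK_SIZE];
		__u32	fields[AUDIT_MAX_FIELDS];
		__u32	values[AUDIT_MAX_FIELDS];
	};

	struct audit_rule_data {		/* same leading members, which is why the
						 * old (struct audit_rule *) cast "worked" */
		__u32	flags;
		__u32	action;
		__u32	field_count;
		__u32	mask[AUDIT_BITMASK_SIZE];
		__u32	fields[AUDIT_MAX_FIELDS];
		__u32	values[AUDIT_MAX_FIELDS];
		__u32	fieldflags[AUDIT_MAX_FIELDS];
		__u32	buflen;			/* total length of string fields */
		char	buf[0];			/* string fields buffer */
	};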
Signed-off-by: Eric Paris --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 96c8a704f130..70101e0b184a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -228,7 +228,7 @@ static int audit_match_signal(struct audit_entry *entry) #endif /* Common user-space to kernel rule translation. */ -static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) +static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule) { unsigned listnr; struct audit_entry *entry; @@ -405,7 +405,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, int i; char *str; - entry = audit_to_entry_common((struct audit_rule *)data); + entry = audit_to_entry_common(data); if (IS_ERR(entry)) goto exit_nofree; -- cgit v1.3-14-g43fede From d23082257d83e4bc89727d5aedee197e907999d2 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 2 Apr 2014 17:45:05 +0200 Subject: pid_namespace: pidns_get() should check task_active_pid_ns() != NULL pidns_get()->get_pid_ns() can hit ns == NULL. This task_struct can't go away, but task_active_pid_ns(task) is NULL if release_task(task) was already called. Alternatively we could change get_pid_ns(ns) to check ns != NULL, but it seems that other callers are fine. Signed-off-by: Oleg Nesterov Cc: Eric W. Biederman Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 06c62de9c711..db95d8eb761b 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -318,7 +318,9 @@ static void *pidns_get(struct task_struct *task) struct pid_namespace *ns; rcu_read_lock(); - ns = get_pid_ns(task_active_pid_ns(task)); + ns = task_active_pid_ns(task); + if (ns) + get_pid_ns(ns); rcu_read_unlock(); return ns; -- cgit v1.3-14-g43fede From 81c98869faa5f3a9457c93efef908ef476326b31 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 3 Apr 2014 14:46:25 -0700 Subject: kthread: ensure locality of task_struct allocations In the presence of memoryless nodes, numa_node_id() will return the current CPU's NUMA node, but that may not be where we expect to allocate memory from. Instead, we should rely on the fallback code in the memory allocator itself, by using NUMA_NO_NODE. Also, when calling kthread_create_on_node(), use the nearest node with memory to the cpu in question, rather than the node it is running on.
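(Editor's sketch, not part of the commit message: the caller-side distinction at stake. cpu_to_node() can name a memoryless node, while cpu_to_mem() names the nearest node that actually has memory; worker_fn and spawn_worker are hypothetical names used only for illustration.)

	#include <linux/kthread.h>
	#include <linux/topology.h>

	static int worker_fn(void *data)	/* hypothetical thread function */
	{
		return 0;
	}

	static struct task_struct *spawn_worker(int cpu)
	{
		/*
		 * Ask for the nearest node with memory rather than merely the
		 * CPU's own node, so the task_struct and stack land close by
		 * even when cpu sits on a memoryless node.
		 */
		return kthread_create_on_node(worker_fn, NULL, cpu_to_mem(cpu),
					      "worker/%d", cpu);
	}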
Signed-off-by: Nishanth Aravamudan Reviewed-by: Christoph Lameter Acked-by: David Rientjes Cc: Anton Blanchard Cc: Tejun Heo Cc: Oleg Nesterov Cc: Jan Kara Cc: Thomas Gleixner Cc: Tetsuo Handa Cc: Wanpeng Li Cc: Joonsoo Kim Cc: Ben Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index b5ae3ee860a9..9a130ec06f7a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -217,7 +217,7 @@ int tsk_fork_get_node(struct task_struct *tsk) if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif - return numa_node_id(); + return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) @@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), { struct task_struct *p; - p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, + p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, cpu); if (IS_ERR(p)) return p; -- cgit v1.3-14-g43fede From 62572e29bc530b38921ef6059088b4788a9832a5 Mon Sep 17 00:00:00 2001 From: Ben Zhang Date: Thu, 3 Apr 2014 14:47:18 -0700 Subject: kernel/watchdog.c: touch_nmi_watchdog should only touch local cpu not every one I ran into a scenario where, while one cpu was stuck and should have panic'd because of the NMI watchdog, it didn't. The reason was another cpu was spewing stack dumps onto the console. Upon investigation, I noticed that when writing to the console and also when dumping the stack, the watchdog is touched. This causes all the cpus to reset their NMI watchdog flags and the 'stuck' cpu just spins forever. This change alters the semantics of touch_nmi_watchdog slightly. Previously, I accidentally changed the semantics and we noticed there was a codepath in which touch_nmi_watchdog could be called from a preemptible area. That caused a BUG() to happen when CONFIG_DEBUG_PREEMPT was enabled. I believe it was the acpi code. My attempt here re-introduces the change to have the touch_nmi_watchdog() code only touch the local cpu instead of all of the cpus. But instead of using __get_cpu_var(), I use the __raw_get_cpu_var() version. This avoids the preemption problem. However my reasoning wasn't because I was trying to be lazy. Instead I rationalized it as, well if preemption is enabled then interrupts should be enabled too and the NMI watchdog will have no reason to trigger. So it won't matter if the wrong cpu is touched because the percpu interrupt counters the NMI watchdog uses should still be incrementing. Don said: : I'm ok with this patch, though it does alter the behaviour of how : touch_nmi_watchdog works. For the most part I don't think most callers : need to touch all of the watchdogs (on each cpu). Perhaps a corner case : will pop up (the scheduler?? to mimic touch_all_softlockup_watchdogs() ). : : But this does address an issue where if a system is locked up and one cpu : is spewing out useful debug messages (or error messages), the hard lockup : will fail to go off. We have seen this on RHEL also.
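(Editor's sketch, not from the patch: the accessor distinction the paragraph above leans on. watchdog_nmi_touch is the per-cpu flag from the diff below; the comments state assumptions drawn from the commit message.)

	/*
	 * Checked accessor: under CONFIG_DEBUG_PREEMPT this trips the
	 * "using smp_processor_id() in preemptible code" check if the
	 * caller can be preempted.
	 */
	__get_cpu_var(watchdog_nmi_touch) = true;

	/*
	 * Raw accessor: no preemption check. If we migrate mid-access we
	 * may set a neighbouring cpu's flag instead, which is harmless
	 * here: with preemption (and therefore interrupts) enabled, the
	 * NMI watchdog has no reason to fire on this cpu anyway.
	 */
	__raw_get_cpu_var(watchdog_nmi_touch) = true;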
Signed-off-by: Don Zickus Signed-off-by: Ben Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 01c6f979486f..e90089fd78e0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -158,14 +158,14 @@ void touch_all_softlockup_watchdogs(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR void touch_nmi_watchdog(void) { - if (watchdog_user_enabled) { - unsigned cpu; - - for_each_present_cpu(cpu) { - if (per_cpu(watchdog_nmi_touch, cpu) != true) - per_cpu(watchdog_nmi_touch, cpu) = true; - } - } + /* + * Using __raw here because some code paths have + * preemption enabled. If preemption is enabled + * then interrupts should be enabled too, in which + * case we shouldn't have to worry about the watchdog + * going off. + */ + __raw_get_cpu_var(watchdog_nmi_touch) = true; touch_softlockup_watchdog(); } EXPORT_SYMBOL(touch_nmi_watchdog); -- cgit v1.3-14-g43fede From d26914d11751b23ca2e8747725f2cae10c2f2c1b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 3 Apr 2014 14:47:24 -0700 Subject: mm: optimize put_mems_allowed() usage Since put_mems_allowed() is strictly optional (it's a seqcount retry), we don't need to evaluate the function if the allocation was in fact successful, saving an smp_rmb plus some loads and comparisons on some relatively fast paths. Since the naming get/put_mems_allowed() does suggest a mandatory pairing, rename the interface, as suggested by Mel, to resemble the seqcount interface. This gives us: read_mems_allowed_begin() and read_mems_allowed_retry(), where it is important to note that the return value of the latter call is inverted from its previous incarnation. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 27 ++++++++++++++------------- kernel/cpuset.c | 2 +- mm/filemap.c | 4 ++-- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 12 ++++++------ mm/page_alloc.c | 8 ++++---- mm/slab.c | 4 ++-- mm/slub.c | 16 +++++++--------- 8 files changed, 38 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 3fe661fe96d1..b19d3dc2e651 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -87,25 +87,26 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); /* - * get_mems_allowed is required when making decisions involving mems_allowed - * such as during page allocation. mems_allowed can be updated in parallel - * and depending on the new value an operation can fail potentially causing - * process failure. A retry loop with get_mems_allowed and put_mems_allowed - * prevents these artificial failures. + * read_mems_allowed_begin is required when making decisions involving + * mems_allowed such as during page allocation. mems_allowed can be updated in + * parallel and depending on the new value an operation can fail potentially + * causing process failure. A retry loop with read_mems_allowed_begin and + * read_mems_allowed_retry prevents these artificial failures. */ -static inline unsigned int get_mems_allowed(void) +static inline unsigned int read_mems_allowed_begin(void) { return read_seqcount_begin(&current->mems_allowed_seq); } /* - * If this returns false, the operation that took place after get_mems_allowed - * may have failed.
It is up to the caller to retry the operation if + * If this returns true, the operation that took place after + * read_mems_allowed_begin may have failed artificially due to a concurrent + * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. */ -static inline bool put_mems_allowed(unsigned int seq) +static inline bool read_mems_allowed_retry(unsigned int seq) { - return !read_seqcount_retry(&current->mems_allowed_seq, seq); + return read_seqcount_retry(&current->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) @@ -225,14 +226,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline unsigned int get_mems_allowed(void) +static inline unsigned int read_mems_allowed_begin(void) { return 0; } -static inline bool put_mems_allowed(unsigned int seq) +static inline bool read_mems_allowed_retry(unsigned int seq) { - return true; + return false; } #endif /* !CONFIG_CPUSETS */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e6b1b66afe52..f6fc7475f1a1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1022,7 +1022,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_lock(tsk); /* * Determine if a loop is necessary if another thread is doing - * get_mems_allowed(). If at least one node remains unchanged and + * read_mems_allowed_begin(). If at least one node remains unchanged and * tsk does not have a mempolicy, then an empty nodemask will not be * possible when mems_allowed is larger than a word. */ diff --git a/mm/filemap.c b/mm/filemap.c index 7a13f6ac5421..068cd2a63d32 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -520,10 +520,10 @@ struct page *__page_cache_alloc(gfp_t gfp) if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c01cb9fedb18..139b7462203b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -540,7 +540,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, goto err; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask(h), &mpol, &nodemask); @@ -562,7 +562,7 @@ retry_cpuset: } mpol_cond_put(mpol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4755c8576942..e3ab02822799 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1899,7 +1899,7 @@ int node_random(const nodemask_t *maskp) * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist.
* - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -2063,7 +2063,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, retry_cpuset: pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -2071,7 +2071,7 @@ retry_cpuset: nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2081,7 +2081,7 @@ retry_cpuset: policy_nodemask(gfp, pol)); if (unlikely(mpol_needs_cond_ref(pol))) __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -2115,7 +2115,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* * No reference counting needed for current->mempolicy @@ -2128,7 +2128,7 @@ retry_cpuset: policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3bac76ae4b30..979378deccbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2739,7 +2739,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, return NULL; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, @@ -2777,7 +2777,7 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. 
*/ - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; memcg_kmem_commit_charge(page, memcg, order); @@ -3045,9 +3045,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) goto out; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); out: return ret; } diff --git a/mm/slab.c b/mm/slab.c index b264214c77ea..9153c802e2fe 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3073,7 +3073,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(slab_node(), flags); retry: @@ -3131,7 +3131,7 @@ retry: } } - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index 25f14ad8f817..7611f148ee81 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, object = get_partial_node(s, n, c, flags); if (object) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. It's a - * harmless race between the alloc - * and the cpuset update. + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); return object; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } -- cgit v1.3-14-g43fede From 5509a5d27b971a90b940e148ca9ca53312e4fa7a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 3 Apr 2014 14:48:19 -0700 Subject: drop_caches: add some documentation and info message There is plenty of anecdotal evidence and a load of blog posts suggesting that using "drop_caches" periodically keeps your system running in "tip top shape". Perhaps adding some kernel documentation will increase the amount of accurate data on its use. If we are not shrinking caches effectively, then we have real bugs. Using drop_caches will simply mask the bugs and make them harder to find, but certainly does not fix them, nor is it an appropriate "workaround" to limit the size of the caches. On the contrary, there have been bug reports on issues that turned out to be misguided use of cache dropping. Dropping caches is a very drastic and disruptive operation that is good for debugging and running tests, but if it creates bug reports from production use, kernel developers should be aware of its use. Add a bit more documentation about it, a syslog message to track down abusers, and vmstat drop counters to help analyze problem reports. 
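For illustration, a minimal userspace sketch of the documented usage; the helper name and error handling are illustrative, only the /proc path and the values 1-3 come from the documentation added by this patch:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* illustrative helper: sync first to minimize dirty objects, then
	 * ask the kernel to drop clean pagecache and/or reclaimable slab */
	static int drop_caches(const char *val)	/* "1", "2" or "3" */
	{
		int fd;

		sync();				/* write back dirty data first */
		fd = open("/proc/sys/vm/drop_caches", O_WRONLY);
		if (fd < 0) {
			perror("open");
			return -1;
		}
		if (write(fd, val, 1) != 1)
			perror("write");
		close(fd);
		return 0;
	}

With this patch applied, each such write also shows up in the kernel log and in the new drop_pagecache/drop_slab vmstat counters.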
[akpm@linux-foundation.org: checkpatch fixes] [hannes@cmpxchg.org: add runtime suppression control] Signed-off-by: Dave Hansen Signed-off-by: Michal Hocko Acked-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 33 +++++++++++++++++++++++++++------ fs/drop_caches.c | 16 ++++++++++++++-- include/linux/vm_event_item.h | 1 + kernel/sysctl.c | 4 ++-- mm/vmstat.c | 3 +++ 5 files changed, 47 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index d614a9b6a280..dd9d0e33b443 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -175,18 +175,39 @@ Setting this to zero disables periodic writeback altogether. drop_caches -Writing to this will cause the kernel to drop clean caches, dentries and -inodes from memory, causing that memory to become free. +Writing to this will cause the kernel to drop clean caches, as well as +reclaimable slab objects like dentries and inodes. Once dropped, their +memory becomes free. To free pagecache: echo 1 > /proc/sys/vm/drop_caches -To free dentries and inodes: +To free reclaimable slab objects (includes dentries and inodes): echo 2 > /proc/sys/vm/drop_caches -To free pagecache, dentries and inodes: +To free slab objects and pagecache: echo 3 > /proc/sys/vm/drop_caches -As this is a non-destructive operation and dirty objects are not freeable, the -user should run `sync' first. +This is a non-destructive operation and will not free any dirty objects. +To increase the number of objects freed by this operation, the user may run +`sync' prior to writing to /proc/sys/vm/drop_caches. This will minimize the +number of dirty objects on the system and create more candidates to be +dropped. + +This file is not a means to control the growth of the various kernel caches +(inodes, dentries, pagecache, etc...) These objects are automatically +reclaimed by the kernel when memory is needed elsewhere on the system. + +Use of this file can cause performance problems. Since it discards cached +objects, it may cost a significant amount of I/O and CPU to recreate the +dropped objects, especially if they were under heavy use. Because of this, +use outside of a testing or debugging environment is not recommended. + +You may see informational messages in your kernel log when this file is +used: + + cat (1234): drop_caches: 3 + +These are informational only. They do not mean that anything is wrong +with your system. To disable them, echo 4 (bit 3) into drop_caches. 
============================================================== diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 9fd702f5bfb2..9280202e488c 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write, if (ret) return ret; if (write) { - if (sysctl_drop_caches & 1) + static int stfu; + + if (sysctl_drop_caches & 1) { iterate_supers(drop_pagecache_sb, NULL); - if (sysctl_drop_caches & 2) + count_vm_event(DROP_PAGECACHE); + } + if (sysctl_drop_caches & 2) { drop_slab(); + count_vm_event(DROP_SLAB); + } + if (!stfu) { + pr_info("%s (%d): drop_caches: %d\n", + current->comm, task_pid_nr(current), + sysctl_drop_caches); + } + stfu |= sysctl_drop_caches & 4; } return 0; } diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 3a712e2e7d76..486c3972c0be 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -37,6 +37,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PGINODESTEAL, SLABS_SCANNED, KSWAPD_INODESTEAL, KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, PAGEOUTRUN, ALLOCSTALL, PGROTATED, + DROP_PAGECACHE, DROP_SLAB, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, NUMA_HUGE_PTE_UPDATES, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09d2e2413605..5c14b547882e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -123,7 +123,7 @@ static int __maybe_unused neg_one = -1; static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; -static int __maybe_unused three = 3; +static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; #ifdef CONFIG_PRINTK @@ -1264,7 +1264,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = drop_caches_sysctl_handler, .extra1 = &one, - .extra2 = &three, + .extra2 = &four, }, #ifdef CONFIG_COMPACTION { diff --git a/mm/vmstat.c b/mm/vmstat.c index f3155d51acfd..197b4c4a9587 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -813,6 +813,9 @@ const char * const vmstat_text[] = { "pgrotated", + "drop_pagecache", + "drop_slab", + #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", "numa_huge_pte_updates", -- cgit v1.3-14-g43fede From 6af9f7bf3c399e0ab1eee048e13572c6d4e15fe9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 3 Apr 2014 14:48:25 -0700 Subject: sys_sysfs: Add CONFIG_SYSFS_SYSCALL sys_sysfs is an obsolete system call no longer supported by libc. - This patch adds a default CONFIG_SYSFS_SYSCALL=y - Option can be turned off in expert mode. 
- cond_syscall added to kernel/sys_ni.c [akpm@linux-foundation.org: tweak Kconfig help text] Signed-off-by: Fabian Frederick Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/filesystems.c | 2 ++ init/Kconfig | 10 ++++++++++ kernel/sys_ni.c | 1 + 3 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/fs/filesystems.c b/fs/filesystems.c index 92567d95ba6a..5797d45a78cb 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs) EXPORT_SYMBOL(unregister_filesystem); +#ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; @@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) } return retval; } +#endif int __init get_filesystem_list(char *buf) { diff --git a/init/Kconfig b/init/Kconfig index d56cb03c1b49..e45cc62904b3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1290,6 +1290,16 @@ config UID16 help This enables the legacy 16-bit UID syscall wrappers. +config SYSFS_SYSCALL + bool "Sysfs syscall support" if EXPERT + default y + ---help--- + sys_sysfs is an obsolete system call no longer supported in libc. + Note that disabling this option is more secure but might break + compatibility with some systems. + + If unsure say Y here. + config SYSCTL_SYSCALL bool "Sysctl syscall support" if EXPERT depends on PROC_SYSCTL diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7078052284fd..74395a95b7e9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -146,6 +146,7 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); cond_syscall(sys_process_vm_writev); -- cgit v1.3-14-g43fede From 8f6c5ffc8987f4f5b5a3e9d557d94bbf3a9bf216 Mon Sep 17 00:00:00 2001 From: Wang YanQing Date: Thu, 3 Apr 2014 14:48:26 -0700 Subject: kernel/groups.c: remove return value of set_groups After commit 6307f8fee295 ("security: remove dead hook task_setgroups"), set_groups will always return zero, so we can simply remove the return value of set_groups(). This patch reduces code size and simplifies code that uses set_groups(), because callers no longer need to check its return value. [akpm@linux-foundation.org: remove obsolete claims from set_groups() comment] Signed-off-by: Wang YanQing Cc: "Eric W.
Biederman" Cc: Serge Hallyn Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/auth.c | 5 +---- include/linux/cred.h | 2 +- kernel/groups.c | 14 ++------------ 3 files changed, 4 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 06cddd572264..2645be435e75 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -71,10 +71,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) if (gid_eq(new->fsgid, INVALID_GID)) new->fsgid = exp->ex_anon_gid; - ret = set_groups(new, gi); + set_groups(new, gi); put_group_info(gi); - if (ret < 0) - goto error; if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID)) new->cap_effective = cap_drop_nfsd_set(new->cap_effective); @@ -89,7 +87,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) oom: ret = -ENOMEM; -error: abort_creds(new); return ret; } diff --git a/include/linux/cred.h b/include/linux/cred.h index 04421e825365..f61d6c8f5ef3 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -66,7 +66,7 @@ extern struct group_info *groups_alloc(int); extern struct group_info init_groups; extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); -extern int set_groups(struct cred *, struct group_info *); +extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); /* access the groups "array" with this macro */ diff --git a/kernel/groups.c b/kernel/groups.c index 90cf1c38c8ea..451698f86cfa 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -157,17 +157,13 @@ int groups_search(const struct group_info *group_info, kgid_t grp) * set_groups - Change a group subscription in a set of credentials * @new: The newly prepared set of credentials to alter * @group_info: The group list to install - * - * Validate a group subscription and, if valid, insert it into a set - * of credentials. */ -int set_groups(struct cred *new, struct group_info *group_info) +void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; - return 0; } EXPORT_SYMBOL(set_groups); @@ -182,18 +178,12 @@ EXPORT_SYMBOL(set_groups); int set_current_groups(struct group_info *group_info) { struct cred *new; - int ret; new = prepare_creds(); if (!new) return -ENOMEM; - ret = set_groups(new, group_info); - if (ret < 0) { - abort_creds(new); - return ret; - } - + set_groups(new, group_info); return commit_creds(new); } -- cgit v1.3-14-g43fede From 69369a7003735d0d8ef22097e27a55a8bad9557a Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Thu, 3 Apr 2014 14:48:27 -0700 Subject: fs, kernel: permit disabling the uselib syscall uselib hasn't been used since libc5; glibc does not use it. Support turning it off. When disabled, also omit the load_elf_library implementation from binfmt_elf.c, which only uselib invokes. bloat-o-meter: add/remove: 0/4 grow/shrink: 0/1 up/down: 0/-785 (-785) function old new delta padzero 39 36 -3 uselib_flags 20 - -20 sys_uselib 168 - -168 SyS_uselib 168 - -168 load_elf_library 426 - -426 The new CONFIG_USELIB defaults to `y'. 
Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 9 ++++++++- fs/exec.c | 2 ++ init/Kconfig | 10 ++++++++++ kernel/sys_ni.c | 1 + 4 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 67be2951b98a..0f59799fa105 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -46,10 +46,15 @@ #endif static int load_elf_binary(struct linux_binprm *bprm); -static int load_elf_library(struct file *); static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, int, int, unsigned long); +#ifdef CONFIG_USELIB +static int load_elf_library(struct file *); +#else +#define load_elf_library NULL +#endif + /* * If we don't support core dumping, then supply a NULL so we * don't even try. @@ -1005,6 +1010,7 @@ out_free_ph: goto out; } +#ifdef CONFIG_USELIB /* This is really simpleminded and specialized - we are loading an a.out library that is given an ELF header. */ static int load_elf_library(struct file *file) @@ -1083,6 +1089,7 @@ out_free_ph: out: return error; } +#endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_ELF_CORE /* diff --git a/fs/exec.c b/fs/exec.c index 4f59402fdda5..25dfeba6d55f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -97,6 +97,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt) module_put(fmt->module); } +#ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. @@ -156,6 +157,7 @@ exit: out: return error; } +#endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* diff --git a/init/Kconfig b/init/Kconfig index e45cc62904b3..8114a06117e3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -273,6 +273,16 @@ config FHANDLE get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2) syscalls. +config USELIB + bool "uselib syscall" + default y + help + This option enables the uselib syscall, a system call used in the + dynamic linker from libc5 and earlier. glibc does not use this + system call. If you intend to run programs built on libc5 or + earlier, you may need to enable this syscall. Current systems + running glibc can safely disable this. + config AUDIT bool "Auditing support" depends on NET diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 74395a95b7e9..bc8d1b74a6b9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -152,6 +152,7 @@ cond_syscall(sys_process_vm_readv); cond_syscall(sys_process_vm_writev); cond_syscall(compat_sys_process_vm_readv); cond_syscall(compat_sys_process_vm_writev); +cond_syscall(sys_uselib); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.3-14-g43fede From c96d6660dc65b0a90aea9834bfd8be1d5656da18 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 3 Apr 2014 14:48:35 -0700 Subject: kernel: audit/fix non-modular users of module_init in core code Code that is obj-y (always built-in) or dependent on a bool Kconfig (built-in or absent) can never be modular. So using module_init as an alias for __initcall can be somewhat misleading. Fix these up now, so that we can relocate module_init from init.h into module.h in the future. If we don't do this, we'd have to add module.h to obviously non-modular code, and that would be a worse thing. 
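The mechanical shape of the change is the same in every file; as a sketch (foo_init is a placeholder name, the real call sites are listed next):

	/* built-in (obj-y / bool Kconfig) code cannot be a module, so
	 * register its initcall explicitly at the subsys level instead
	 * of via the misleading module_init alias */
	static int __init foo_init(void)
	{
		return 0;
	}
	/* was: module_init(foo_init); */
	subsys_initcall(foo_init);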
The audit targets the following module_init users for change:

kernel/user.c             obj-y
kernel/kexec.c            bool KEXEC (one instance per arch)
kernel/profile.c          bool PROFILING
kernel/hung_task.c        bool DETECT_HUNG_TASK
kernel/sched/stats.c      bool SCHEDSTATS
kernel/user_namespace.c   bool USER_NS

Note that direct use of __initcall is discouraged, vs. one of the priority categorized subgroups. As __initcall gets mapped onto device_initcall, our use of subsys_initcall (which makes sense for these files) will thus change this registration from level 6-device to level 4-subsys (i.e. slightly earlier). However, no impact of that difference was observed during testing. Also, two instances of missing ";" at EOL are fixed in kexec. Signed-off-by: Paul Gortmaker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 3 +-- kernel/kexec.c | 4 ++-- kernel/profile.c | 2 +- kernel/sched/stats.c | 2 +- kernel/user.c | 3 +-- kernel/user_namespace.c | 2 +- 6 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0b9c169d577f..06bb1417b063 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -246,5 +246,4 @@ static int __init hung_task_init(void) return 0; } - -module_init(hung_task_init); +subsys_initcall(hung_task_init); diff --git a/kernel/kexec.c b/kernel/kexec.c index 45601cf41bee..c0d261c7db7b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1235,7 +1235,7 @@ static int __init crash_notes_memory_init(void) } return 0; } -module_init(crash_notes_memory_init) +subsys_initcall(crash_notes_memory_init); /* @@ -1629,7 +1629,7 @@ static int __init crash_save_vmcoreinfo_init(void) return 0; } -module_init(crash_save_vmcoreinfo_init) +subsys_initcall(crash_save_vmcoreinfo_init); /* * Move into place and start executing a preloaded standalone diff --git a/kernel/profile.c b/kernel/profile.c index ebdd9c1a86b4..1b266dbe755a 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -604,5 +604,5 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ hotcpu_notifier(profile_cpu_callback, 0); return 0; } -module_init(create_proc_profile); +subsys_initcall(create_proc_profile); #endif /* CONFIG_PROC_FS */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index da98af347e8b..a476bea17fbc 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void) proc_create("schedstat", 0, NULL, &proc_schedstat_operations); return 0; } -module_init(proc_schedstat_init); +subsys_initcall(proc_schedstat_init); diff --git a/kernel/user.c b/kernel/user.c index c006131beb77..294fc6a94168 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -222,5 +222,4 @@ static int __init uid_cache_init(void) return 0; } - -module_init(uid_cache_init); +subsys_initcall(uid_cache_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index dd06439b9c84..0d8f6023fd8d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -902,4 +902,4 @@ static __init int user_namespaces_init(void) user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); return 0; } -module_init(user_namespaces_init); +subsys_initcall(user_namespaces_init); -- cgit v1.3-14-g43fede From 28ab49ff7f3dcaf4df8d2bd0d4099b8c08285ed7 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Thu, 3 Apr 2014 14:48:36 -0700 Subject: kernel/resource.c: make reallocate_resource() static sparse says: kernel/resource.c:518:5:
warning: symbol 'reallocate_resource' was not declared. Should it be static? Signed-off-by: Daeseok Youn Reviewed-by: Yasuaki Ishimatsu Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 673061c06da1..8957d686e29b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -511,7 +511,7 @@ static int find_resource(struct resource *root, struct resource *new, * @newsize: new size of the resource descriptor * @constraint: the size and alignment constraints to be met. */ -int reallocate_resource(struct resource *root, struct resource *old, +static int reallocate_resource(struct resource *root, struct resource *old, resource_size_t newsize, struct resource_constraint *constraint) { -- cgit v1.3-14-g43fede From c64730b26f08cccfbc8fcbf169c304b4bd71dcac Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:39 -0700 Subject: printk: remove obsolete check for log level "c" The kernel log level "c" was removed in commit 61e99ab8e35a ("printk: remove the now unnecessary "C" annotation for KERN_CONT"). It is no longer detected in printk_get_level(). Hence we do not need to check it in vprintk_emit. Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4dae9cbe9259..db7a02e05241 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1560,8 +1560,6 @@ asmlinkage int vprintk_emit(int facility, int level, level = kern_level - '0'; case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; - case 'c': /* KERN_CONT */ - break; } text_len -= end_of_header - text; text = (char *)end_of_header; -- cgit v1.3-14-g43fede From e8c42d36ab86cf45f88c3a0e344233b1032fbf3d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:41 -0700 Subject: printk: add comment about tricky check for text buffer size There is no check for potential "text_len" overflow. It is not needed because only a valid level is detected. It took me some time to understand why. It would deserve a comment ;-) Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index db7a02e05241..012f3e40671d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1561,6 +1561,11 @@ asmlinkage int vprintk_emit(int facility, int level, case 'd': /* KERN_DEFAULT */ lflags |= LOG_PREFIX; } + /* + * No need to check length here because vscnprintf + * put '\0' at the end of the string. Only valid and + * newly printed level is detected. + */ text_len -= end_of_header - text; text = (char *)end_of_header; } -- cgit v1.3-14-g43fede From 39b25109b400ea397e64c417d8b965a53e2ee0f0 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:42 -0700 Subject: printk: use also the last bytes in the ring buffer It seems that we have never used the last byte in the ring buffer. In fact, we have never used the last 4 bytes because of padding. The first problem is in the check for free space.
The exact number of free bytes is enough to store the length of data. The second problem is in the check where the ring buffer is rotated. The left side counts the first unused index. It is unused, so it might be the same as the size of the buffer. Note that the first problem has to be fixed together with the second one. Otherwise, the buffer is rotated even when there is enough space at the end of the buffer. Then the beginning of the buffer is rewritten and valid entries get corrupted. Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 012f3e40671d..b3a1790f9e05 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -319,7 +319,7 @@ static void log_store(int facility, int level, else free = log_first_idx - log_next_idx; - if (free > size + sizeof(struct printk_log)) + if (free >= size + sizeof(struct printk_log)) break; /* drop old messages until we have enough contiuous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } - if (log_next_idx + size + sizeof(struct printk_log) >= log_buf_len) { + if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* * This message + an additional empty header does not fit * at the end of the buffer. Add an empty header with len == 0 -- cgit v1.3-14-g43fede From fce6e0338abe910ba6d4db0657ae8adc6aa1a72b Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 3 Apr 2014 14:48:43 -0700 Subject: printk: do not compute the size of the message twice This is just a tiny optimization. It removes duplicate computation of the message size. Signed-off-by: Petr Mladek Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jan Kara Cc: Michal Hocko Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b3a1790f9e05..ff9faf4e3cd5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -351,7 +351,7 @@ static void log_store(int facility, int level, else msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); - msg->len = sizeof(struct printk_log) + text_len + dict_len + pad_len; + msg->len = size; /* insert message */ log_next_idx += msg->len; -- cgit v1.3-14-g43fede From 72581487a61f6304a7cc32e189eb65fb1c920a53 Mon Sep 17 00:00:00 2001 From: Jane Li Date: Thu, 3 Apr 2014 14:48:45 -0700 Subject: printk: fix one circular lockdep warning about console_lock Fix a warning about a possible circular locking dependency. If we perform the following sequence: enter suspend -> resume -> plug-out CPUx (echo 0 > cpux/online) then lockdep will show a warning like the following:

======================================================
[ INFO: possible circular locking dependency detected ]
3.10.0 #2 Tainted: G O
-------------------------------------------------------
sh/1271 is trying to acquire lock:
 (console_lock){+.+.+.}, at: console_cpu_notify+0x20/0x2c
but task is already holding lock:
 (cpu_hotplug.lock){+.+.+.}, at: cpu_hotplug_begin+0x2c/0x58
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:

-> #2 (cpu_hotplug.lock){+.+.+.}:
       lock_acquire+0x98/0x12c
       mutex_lock_nested+0x50/0x3d8
       cpu_hotplug_begin+0x2c/0x58
       _cpu_up+0x24/0x154
       cpu_up+0x64/0x84
       smp_init+0x9c/0xd4
       kernel_init_freeable+0x78/0x1c8
       kernel_init+0x8/0xe4
       ret_from_fork+0x14/0x2c

-> #1 (cpu_add_remove_lock){+.+.+.}:
       lock_acquire+0x98/0x12c
       mutex_lock_nested+0x50/0x3d8
       disable_nonboot_cpus+0x8/0xe8
       suspend_devices_and_enter+0x214/0x448
       pm_suspend+0x1e4/0x284
       try_to_suspend+0xa4/0xbc
       process_one_work+0x1c4/0x4fc
       worker_thread+0x138/0x37c
       kthread+0xa4/0xb0
       ret_from_fork+0x14/0x2c

-> #0 (console_lock){+.+.+.}:
       __lock_acquire+0x1b38/0x1b80
       lock_acquire+0x98/0x12c
       console_lock+0x54/0x68
       console_cpu_notify+0x20/0x2c
       notifier_call_chain+0x44/0x84
       __cpu_notify+0x2c/0x48
       cpu_notify_nofail+0x8/0x14
       _cpu_down+0xf4/0x258
       cpu_down+0x24/0x40
       store_online+0x30/0x74
       dev_attr_store+0x18/0x24
       sysfs_write_file+0x16c/0x19c
       vfs_write+0xb4/0x190
       SyS_write+0x3c/0x70
       ret_fast_syscall+0x0/0x48

Chain exists of:
  console_lock --> cpu_add_remove_lock --> cpu_hotplug.lock

Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(cpu_hotplug.lock);
                               lock(cpu_add_remove_lock);
                               lock(cpu_hotplug.lock);
  lock(console_lock);

*** DEADLOCK ***

There are three locks involved in two sequences:

a) pm suspend:
       console_lock (@suspend_console())
       cpu_add_remove_lock (@disable_nonboot_cpus())
       cpu_hotplug.lock (@_cpu_down())

b) Plug-out CPUx:
       cpu_add_remove_lock (@cpu_down())
       cpu_hotplug.lock (@_cpu_down())
       console_lock (@console_cpu_notify())

=> Lockdep prints the warning.

There should be no real deadlock, as the console_suspended flag protects against it. Although console_suspend() releases console_sem, it doesn't tell lockdep about it. That results in the lockdep warning about circular locking when doing the following: enter suspend -> resume -> plug-out CPUx (echo 0 > cpux/online) Fix the problem by telling lockdep we actually released the semaphore in console_suspend() and acquired it again in console_resume(). Signed-off-by: Jane Li Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ff9faf4e3cd5..a45b50962295 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1883,6 +1883,7 @@ void suspend_console(void) console_lock(); console_suspended = 1; up(&console_sem); + mutex_release(&console_lock_dep_map, 1, _RET_IP_); } void resume_console(void) @@ -1890,6 +1891,7 @@ void resume_console(void) if (!console_suspend_enabled) return; down(&console_sem); + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); console_suspended = 0; console_unlock(); } -- cgit v1.3-14-g43fede From c6b3d5bcd67c75961a1e8b9564d1475c0f194a84 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 4 Apr 2014 17:14:41 +0800 Subject: cgroup: fix top cgroup refcnt leak As mount() and kill_sb() are not a one-to-one match, if we mount the same cgroupfs at several mount points and then umount all of them, kill_sb() will be called only once. Try:

  # mount -t cgroup -o cpuacct xxx /cgroup
  # mount -t cgroup -o cpuacct xxx /cgroup2
  # cat /proc/cgroups | grep cpuacct
  cpuacct 2 1 1
  # umount /cgroup
  # umount /cgroup2
  # cat /proc/cgroups | grep cpuacct
  cpuacct 2 1 1

You'll see cgroupfs will never be freed.
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fede3d3f28ff..0dfc7324c789 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1487,6 +1487,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, struct cgroup_sb_opts opts; struct dentry *dentry; int ret; + bool new_sb; /* * The first time anyone tries to mount a cgroup, enable the list @@ -1603,8 +1604,8 @@ out_unlock: if (ret) return ERR_PTR(ret); - dentry = kernfs_mount(fs_type, flags, root->kf_root, NULL); - if (IS_ERR(dentry)) + dentry = kernfs_mount(fs_type, flags, root->kf_root, &new_sb); + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); return dentry; } -- cgit v1.3-14-g43fede From b8780c363d808a726a34793caa900923d32b6b80 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 7 Apr 2014 17:33:06 +0200 Subject: sched: remove sleep_on() and friends This is the final piece in the puzzle, as all patches to remove the last users of \(interruptible_\|\)sleep_on\(_timeout\|\) have made it into the 3.15 merge window. The work was long overdue, and this interface in particular should not have survived the BKL removal that was done a couple of years ago. Citing Jon Corbet from http://lwn.net/2001/0201/kernel.php3: "[...] it was suggested that the janitors look for and fix all code that calls sleep_on() [...] since (1) almost all such code is incorrect, and (2) Linus has agreed that those functions should be removed in the 2.5 development series". We haven't quite made it for 2.5, but maybe we can merge this for 3.15. Signed-off-by: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- Documentation/DocBook/kernel-hacking.tmpl | 10 ------- include/linux/wait.h | 11 -------- kernel/sched/core.c | 46 ------------------------------- 3 files changed, 67 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index d0758b241b23..bd9015d10cff 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -850,16 +850,6 @@ printk(KERN_INFO "my ip: %pI4\n", &ipaddress); -ERESTARTSYS if a signal is received. The wait_event() version ignores signals. - - Do not use the sleep_on() function family - - it is very easy to accidentally introduce races; almost certainly - one of the wait_event() family will do, or a - loop around schedule_timeout(). If you choose - to loop around schedule_timeout() remember - you must set the task state (with - set_current_state()) on each iteration to avoid - busy-looping. - diff --git a/include/linux/wait.h b/include/linux/wait.h index 559044c79232..e7d9d9ed14f5 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -803,17 +803,6 @@ do { \ __ret; \ }) - -/* - * These are the old interfaces to sleep waiting for an event. - * They are racy. DO NOT use them, use the wait_event* interfaces above. - * We plan to remove these interfaces.
- */ -extern void sleep_on(wait_queue_head_t *q); -extern long sleep_on_timeout(wait_queue_head_t *q, signed long timeout); -extern void interruptible_sleep_on(wait_queue_head_t *q); -extern long interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout); - /* * Waitqueues which are removed from the waitqueue_head at wakeup time */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d1b87b36778..0ff3f34bc7e3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2845,52 +2845,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -static long __sched -sleep_on_common(wait_queue_head_t *q, int state, long timeout) -{ - unsigned long flags; - wait_queue_t wait; - - init_waitqueue_entry(&wait, current); - - __set_current_state(state); - - spin_lock_irqsave(&q->lock, flags); - __add_wait_queue(q, &wait); - spin_unlock(&q->lock); - timeout = schedule_timeout(timeout); - spin_lock_irq(&q->lock); - __remove_wait_queue(q, &wait); - spin_unlock_irqrestore(&q->lock, flags); - - return timeout; -} - -void __sched interruptible_sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(interruptible_sleep_on); - -long __sched -interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(interruptible_sleep_on_timeout); - -void __sched sleep_on(wait_queue_head_t *q) -{ - sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); -} -EXPORT_SYMBOL(sleep_on); - -long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) -{ - return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); -} -EXPORT_SYMBOL(sleep_on_timeout); - #ifdef CONFIG_RT_MUTEXES /* -- cgit v1.3-14-g43fede From 49957f8e2a43035a97d05bddefa394492a969c0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Apr 2014 16:44:47 -0400 Subject: cgroup: newly created dirs and files should be owned by the creator While converting cgroup to kernfs, 2bd59d48ebfb ("cgroup: convert to kernfs") accidentally dropped the logic which makes newly created cgroup dirs and files owned by the current uid / gid. This broke cases where cgroup subtree management is delegated to !root, as the sub manager wouldn't be able to create more than a single level of hierarchy or put tasks into child cgroups it created. Among other things, this breaks user session management in systemd, and one of the symptoms was a 90s hang during shutdown. The user-session systemd, running as the user, creates a sub-service to initiate shutdown and tries to put kill(1) into it, but fails because cgroup.procs is owned by root. This leads to the 90s hang during shutdown. Implement cgroup_kn_set_ugid(), which sets a kn's uid and gid to those of the caller, and use it from the file and dir creation paths.
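A small userspace sketch of the check this enables; the path below is only an example of a delegated cgroup v1 subtree, and the program itself is illustrative:

	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		struct stat st;
		/* example path: with the fix, files the delegatee created
		 * report its uid/gid rather than root's */
		const char *p = "/sys/fs/cgroup/systemd/user.slice/cgroup.procs";

		if (stat(p, &st) < 0) {
			perror("stat");
			return 1;
		}
		printf("%s: uid=%u gid=%u (caller uid=%u)\n", p,
		       (unsigned)st.st_uid, (unsigned)st.st_gid,
		       (unsigned)getuid());
		return 0;
	}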
Signed-off-by: Tejun Heo Reported-by: Linus Torvalds --- kernel/cgroup.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0dfc7324c789..9fcdaa705b6c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2346,11 +2346,26 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, return ret; } +/* set uid and gid of cgroup dirs and files to that of the creator */ +static int cgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { char name[CGROUP_FILE_NAME_MAX]; struct kernfs_node *kn; struct lock_class_key *key = NULL; + int ret; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = &cft->lockdep_key; @@ -2358,7 +2373,13 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), cgroup_file_mode(cft), 0, cft->kf_ops, cft, NULL, false, key); - return PTR_ERR_OR_ZERO(kn); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = cgroup_kn_set_ugid(kn); + if (ret) + kernfs_remove(kn); + return ret; } /** @@ -3753,6 +3774,10 @@ static long cgroup_create(struct cgroup *parent, const char *name, */ idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + err = cgroup_kn_set_ugid(kn); + if (err) + goto err_destroy; + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; -- cgit v1.3-14-g43fede From a0715cc22601e8830ace98366c0c2bd8da52af52 Mon Sep 17 00:00:00 2001 From: Alex Thorlton Date: Mon, 7 Apr 2014 15:37:10 -0700 Subject: mm, thp: add VM_INIT_DEF_MASK and PRCTL_THP_DISABLE Add VM_INIT_DEF_MASK, to allow us to set the default flags for VMs. It also adds a prctl control which allows us to set the THP disable bit in mm->def_flags so that VMs will pick up the setting as they are created. Signed-off-by: Alex Thorlton Suggested-by: Oleg Nesterov Cc: Gerald Schaefer Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Christian Borntraeger Cc: Paolo Bonzini Cc: "Kirill A. Shutemov" Cc: Mel Gorman Acked-by: Rik van Riel Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: Johannes Weiner Cc: David Rientjes Cc: Paolo Bonzini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 +++ include/uapi/linux/prctl.h | 3 +++ kernel/fork.c | 11 ++++++++--- kernel/sys.c | 15 +++++++++++++++ 4 files changed, 29 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 35300f390eb6..c270fa68a32b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -177,6 +177,9 @@ extern unsigned int kobjsize(const void *objp); */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* This mask defines which mm->def_flags a process can inherit its parent */ +#define VM_INIT_DEF_MASK VM_NOHUGEPAGE + /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 289760f424aa..58afc04c107e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -149,4 +149,7 @@ #define PR_GET_TID_ADDRESS 40 +#define PR_SET_THP_DISABLE 41 +#define PR_GET_THP_DISABLE 42 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index abc45890f0a5..e40c0a01d5a6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -530,8 +530,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? - (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); @@ -540,8 +538,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm_init_owner(mm, p); clear_tlb_flush_pending(mm); - if (likely(!mm_alloc_pgd(mm))) { + if (current->mm) { + mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; + } else { + mm->flags = default_dump_filter; mm->def_flags = 0; + } + + if (likely(!mm_alloc_pgd(mm))) { mmu_notifier_mm_init(mm); return mm; } diff --git a/kernel/sys.c b/kernel/sys.c index adaeab6f7a87..fba0f29401ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1996,6 +1996,21 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; return current->no_new_privs ? 1 : 0; + case PR_GET_THP_DISABLE: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + error = !!(me->mm->def_flags & VM_NOHUGEPAGE); + break; + case PR_SET_THP_DISABLE: + if (arg3 || arg4 || arg5) + return -EINVAL; + down_write(&me->mm->mmap_sem); + if (arg2) + me->mm->def_flags |= VM_NOHUGEPAGE; + else + me->mm->def_flags &= ~VM_NOHUGEPAGE; + up_write(&me->mm->mmap_sem); + break; default: error = -EINVAL; break; -- cgit v1.3-14-g43fede From 615d6e8756c87149f2d4c1b93d471bca002bd849 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 7 Apr 2014 15:37:25 -0700 Subject: mm: per-thread vma caching This patch is a continuation of efforts trying to optimize find_vma(), avoiding potentially expensive rbtree walks to locate a vma upon faults. The original approach (https://lkml.org/lkml/2013/11/1/410), where the largest vma was also cached, ended up being too specific and random, so further comparison with other approaches was needed. There are two things to consider when dealing with this: the cache hit rate and the latency of find_vma(). Improving the hit rate does not necessarily translate into finding the vma any faster, as the overhead of any fancy caching scheme can be too high to consider. We currently cache the last used vma for the whole address space, which provides a nice optimization, reducing the total cycles in find_vma() by up to 250% for workloads with good locality. On the other hand, this simple scheme is pretty much useless for workloads with poor locality. Analyzing ebizzy runs shows that, no matter how many threads are running, the mmap_cache hit rate is less than 2%, and in many situations below 1%. The proposed approach is to replace this scheme with a small per-thread cache, maximizing hit rates at a very low maintenance cost. Invalidations are performed by simply bumping up a 32-bit sequence number.
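The invalidation primitive is tiny; the following is distilled from the new include/linux/vmacache.h added below:

	static inline void vmacache_invalidate(struct mm_struct *mm)
	{
		mm->vmacache_seqnum++;

		/* deal with overflows */
		if (unlikely(mm->vmacache_seqnum == 0))
			vmacache_flush_all(mm);
	}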
The only expensive operation is in the rare case of a seq number overflow, where all caches that share the same address space are flushed. Upon a miss, the proposed replacement policy is based on the page number that contains the virtual address in question. Concretely, the following results are seen on an 80 core, 8 socket x86-64 box:

1) System bootup: Most programs are single threaded, so the per-thread scheme improves the ~50% baseline hit rate just by adding a few more slots to the cache.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 50.61%   | 19.90            |
| patched        | 73.45%   | 13.58            |
+----------------+----------+------------------+

2) Kernel build: This one is already pretty good with the current approach as we're dealing with good locality.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 75.28%   | 11.03            |
| patched        | 88.09%   | 9.31             |
+----------------+----------+------------------+

3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 70.66%   | 17.14            |
| patched        | 91.15%   | 12.57            |
+----------------+----------+------------------+

4) Ebizzy: There's a fair amount of variation from run to run, but this approach always shows nearly perfect hit rates, while the baseline is just about non-existent. The baseline's cycle counts fluctuate anywhere from ~60 to ~116 billion, but this approach reduces that considerably. For instance, with 80 threads:

+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline       | 1.06%    | 91.54            |
| patched        | 99.97%   | 14.18            |
+----------------+----------+------------------+

[akpm@linux-foundation.org: fix nommu build, per Davidlohr] [akpm@linux-foundation.org: document vmacache_valid() logic] [akpm@linux-foundation.org: attempt to untangle header files] [akpm@linux-foundation.org: add vmacache_find() BUG_ON] [hughd@google.com: add vmacache_valid_mm() (from Oleg)] [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: adjust and enhance comments] Signed-off-by: Davidlohr Bueso Reviewed-by: Rik van Riel Acked-by: Linus Torvalds Reviewed-by: Michel Lespinasse Cc: Oleg Nesterov Tested-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/unicore32/include/asm/mmu_context.h | 4 +- fs/exec.c | 5 +- fs/proc/task_mmu.c | 3 +- include/linux/mm_types.h | 4 +- include/linux/sched.h | 7 ++ include/linux/vmacache.h | 38 +++++++++++ kernel/debug/debug_core.c | 14 +++- kernel/fork.c | 7 +- mm/Makefile | 2 +- mm/mmap.c | 55 ++++++++------- mm/nommu.c | 24 ++++--- mm/vmacache.c | 112 +++++++++++++++++++++++++++++++ 12 files changed, 231 insertions(+), 44 deletions(-) create mode 100644 include/linux/vmacache.h create mode 100644 mm/vmacache.c (limited to 'kernel') diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index fb5e4c658f7a..ef470a7a3d0f 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -14,6 +14,8 @@ #include #include +#include +#include #include #include @@ -73,7 +75,7 @@ do { \ else \ mm->mmap = NULL; \
rb_erase(&high_vma->vm_rb, &mm->mm_rb); \ - mm->mmap_cache = NULL; \ + vmacache_invalidate(mm); \ mm->map_count--; \ remove_vma(high_vma); \ } \ diff --git a/fs/exec.c b/fs/exec.c index 25dfeba6d55f..b60ccf969a8b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -822,7 +823,7 @@ EXPORT_SYMBOL(read_code); static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; - struct mm_struct * old_mm, *active_mm; + struct mm_struct *old_mm, *active_mm; /* Notify parent that we're no longer interested in the old VM */ tsk = current; @@ -848,6 +849,8 @@ static int exec_mmap(struct mm_struct *mm) tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); + tsk->mm->vmacache_seqnum = 0; + vmacache_flush(tsk); task_unlock(tsk); if (old_mm) { up_read(&old_mm->mmap_sem); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fb52b548080d..442177b1119a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -152,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) /* * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at + * vmacache most of the time. We have zero last_addr at * the beginning and also after lseek. We will have -1 last_addr * after the end of the vmas. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 290901a8c1de..2b58d192ea24 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -342,9 +342,9 @@ struct mm_rss_stat { struct kioctx_table; struct mm_struct { - struct vm_area_struct * mmap; /* list of VMAs */ + struct vm_area_struct *mmap; /* list of VMAs */ struct rb_root mm_rb; - struct vm_area_struct * mmap_cache; /* last find_vma result */ + u32 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/sched.h b/include/linux/sched.h index 7cb07fd26680..642477dd814a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -132,6 +132,10 @@ struct perf_event_context; struct blk_plug; struct filename; +#define VMACACHE_BITS 2 +#define VMACACHE_SIZE (1U << VMACACHE_BITS) +#define VMACACHE_MASK (VMACACHE_SIZE - 1) + /* * List of flags we want to share for kernel threads, * if only because they are not used by them anyway. @@ -1235,6 +1239,9 @@ struct task_struct { #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif + /* per-thread vma caching */ + u32 vmacache_seqnum; + struct vm_area_struct *vmacache[VMACACHE_SIZE]; #if defined(SPLIT_RSS_COUNTING) struct task_rss_stat rss_stat; #endif diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h new file mode 100644 index 000000000000..c3fa0fd43949 --- /dev/null +++ b/include/linux/vmacache.h @@ -0,0 +1,38 @@ +#ifndef __LINUX_VMACACHE_H +#define __LINUX_VMACACHE_H + +#include +#include + +/* + * Hash based on the page number. Provides a good hit rate for + * workloads with good locality and those with random accesses as well. 
+ */ +#define VMACACHE_HASH(addr) ((addr >> PAGE_SHIFT) & VMACACHE_MASK) + +static inline void vmacache_flush(struct task_struct *tsk) +{ + memset(tsk->vmacache, 0, sizeof(tsk->vmacache)); +} + +extern void vmacache_flush_all(struct mm_struct *mm); +extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); +extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, + unsigned long addr); + +#ifndef CONFIG_MMU +extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end); +#endif + +static inline void vmacache_invalidate(struct mm_struct *mm) +{ + mm->vmacache_seqnum++; + + /* deal with overflows */ + if (unlikely(mm->vmacache_seqnum == 0)) + vmacache_flush_all(mm); +} + +#endif /* __LINUX_VMACACHE_H */ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 99982a70ddad..2956c8da1605 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -224,10 +225,17 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm && current->mm->mmap_cache) { - flush_cache_range(current->mm->mmap_cache, - addr, addr + BREAK_INSTR_SIZE); + if (current->mm) { + int i; + + for (i = 0; i < VMACACHE_SIZE; i++) { + if (!current->vmacache[i]) + continue; + flush_cache_range(current->vmacache[i], + addr, addr + BREAK_INSTR_SIZE); + } } + /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/fork.c b/kernel/fork.c index e40c0a01d5a6..bc0e96b78dfd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -364,7 +366,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) mm->locked_vm = 0; mm->mmap = NULL; - mm->mmap_cache = NULL; + mm->vmacache_seqnum = 0; mm->map_count = 0; cpumask_clear(mm_cpumask(mm)); mm->mm_rb = RB_ROOT; @@ -882,6 +884,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; + /* initialize the new vmacache entries */ + vmacache_flush(tsk); + if (clone_flags & CLONE_VM) { atomic_inc(&oldmm->mm_users); mm = oldmm; diff --git a/mm/Makefile b/mm/Makefile index cdd741519ee0..23a6f7e23019 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -16,7 +16,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o balloon_compaction.o \ + compaction.o balloon_compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/mmap.c b/mm/mmap.c index 46433e137abc..b1202cf81f4b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -681,8 +682,9 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - if (mm->mmap_cache == vma) - mm->mmap_cache = prev; + + /* Kill the cache */ + vmacache_invalidate(mm); } /* @@ -1989,34 +1991,33 @@ EXPORT_SYMBOL(get_unmapped_area); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. 
*/ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = NULL; + struct rb_node *rb_node; + struct vm_area_struct *vma; /* Check the cache first. */ - /* (Cache hit rate is typically around 35%.) */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { - struct rb_node *rb_node; + vma = vmacache_find(mm, addr); + if (likely(vma)) + return vma; - rb_node = mm->mm_rb.rb_node; - vma = NULL; + rb_node = mm->mm_rb.rb_node; + vma = NULL; - while (rb_node) { - struct vm_area_struct *vma_tmp; - - vma_tmp = rb_entry(rb_node, - struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - vma = vma_tmp; - if (vma_tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - if (vma) - mm->mmap_cache = vma; + while (rb_node) { + struct vm_area_struct *tmp; + + tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); + + if (tmp->vm_end > addr) { + vma = tmp; + if (tmp->vm_start <= addr) + break; + rb_node = rb_node->rb_left; + } else + rb_node = rb_node->rb_right; } + + if (vma) + vmacache_update(addr, vma); return vma; } @@ -2388,7 +2389,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - mm->mmap_cache = NULL; /* Kill the cache. */ + + /* Kill the cache */ + vmacache_invalidate(mm); } /* diff --git a/mm/nommu.c b/mm/nommu.c index e19482533ce3..5d3f3524bbdc 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -768,16 +769,23 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { + int i; struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; + struct task_struct *curr = current; kenter("%p", vma); protect_vma(vma, 0); mm->map_count--; - if (mm->mmap_cache == vma) - mm->mmap_cache = NULL; + for (i = 0; i < VMACACHE_SIZE; i++) { + /* if the vma is cached, invalidate the entire cache */ + if (curr->vmacache[i] == vma) { + vmacache_invalidate(curr->mm); + break; + } + } /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -825,8 +833,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) struct vm_area_struct *vma; /* check the cache first */ - vma = ACCESS_ONCE(mm->mmap_cache); - if (vma && vma->vm_start <= addr && vma->vm_end > addr) + vma = vmacache_find(mm, addr); + if (likely(vma)) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -835,7 +843,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (vma->vm_start > addr) return NULL; if (vma->vm_end > addr) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } @@ -874,8 +882,8 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; /* check the cache first */ - vma = mm->mmap_cache; - if (vma && vma->vm_start == addr && vma->vm_end == end) + vma = vmacache_find_exact(mm, addr, end); + if (vma) return vma; /* trawl the list (there may be multiple mappings in which addr @@ -886,7 +894,7 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_start > addr) return NULL; if (vma->vm_end == end) { - mm->mmap_cache = vma; + vmacache_update(addr, vma); return vma; } } diff --git a/mm/vmacache.c b/mm/vmacache.c new file mode 100644 index 000000000000..d4224b397c0e --- 
/dev/null +++ b/mm/vmacache.c @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2014 Davidlohr Bueso. + */ +#include +#include +#include + +/* + * Flush vma caches for threads that share a given mm. + * + * The operation is safe because the caller holds the mmap_sem + * exclusively and other threads accessing the vma cache will + * have mmap_sem held at least for read, so no extra locking + * is required to maintain the vma cache. + */ +void vmacache_flush_all(struct mm_struct *mm) +{ + struct task_struct *g, *p; + + rcu_read_lock(); + for_each_process_thread(g, p) { + /* + * Only flush the vmacache pointers as the + * mm seqnum is already set and curr's will + * be set upon invalidation when the next + * lookup is done. + */ + if (mm == p->mm) + vmacache_flush(p); + } + rcu_read_unlock(); +} + +/* + * This task may be accessing a foreign mm via (for example) + * get_user_pages()->find_vma(). The vmacache is task-local and this + * task's vmacache pertains to a different mm (ie, its own). There is + * nothing we can do here. + * + * Also handle the case where a kernel thread has adopted this mm via use_mm(). + * That kernel thread's vmacache is not applicable to this mm. + */ +static bool vmacache_valid_mm(struct mm_struct *mm) +{ + return current->mm == mm && !(current->flags & PF_KTHREAD); +} + +void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) +{ + if (vmacache_valid_mm(newvma->vm_mm)) + current->vmacache[VMACACHE_HASH(addr)] = newvma; +} + +static bool vmacache_valid(struct mm_struct *mm) +{ + struct task_struct *curr; + + if (!vmacache_valid_mm(mm)) + return false; + + curr = current; + if (mm->vmacache_seqnum != curr->vmacache_seqnum) { + /* + * First attempt will always be invalid, initialize + * the new cache for this task here. + */ + curr->vmacache_seqnum = mm->vmacache_seqnum; + vmacache_flush(curr); + return false; + } + return true; +} + +struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start <= addr && vma->vm_end > addr) { + BUG_ON(vma->vm_mm != mm); + return vma; + } + } + + return NULL; +} + +#ifndef CONFIG_MMU +struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + int i; + + if (!vmacache_valid(mm)) + return NULL; + + for (i = 0; i < VMACACHE_SIZE; i++) { + struct vm_area_struct *vma = current->vmacache[i]; + + if (vma && vma->vm_start == start && vma->vm_end == end) + return vma; + } + + return NULL; +} +#endif -- cgit v1.3-14-g43fede From 514ddb446c5c5a238eca32b7052b7a8accae4e93 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:27 -0700 Subject: fork: collapse copy_flags into copy_process copy_flags() does not use the clone_flags formal and can be collapsed into copy_process() for cleaner code. 
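The collapse is safe because the new child is not yet visible on the task list, so the update is two plain bit operations. A stand-alone sketch of the masking (a hypothetical user-space model; the PF_* values mirror include/linux/sched.h, nothing else here is kernel code):

	#include <assert.h>
	#include <stdio.h>

	#define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
	#define PF_WQ_WORKER	0x00000020	/* workqueue worker */
	#define PF_SUPERPRIV	0x00000100	/* used super-user privileges */

	int main(void)
	{
		unsigned long flags = PF_SUPERPRIV | PF_WQ_WORKER;

		/* What copy_process() now does inline instead of copy_flags(): */
		flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
		flags |= PF_FORKNOEXEC;

		assert(flags == PF_FORKNOEXEC);
		printf("child flags: %#lx\n", flags);
		return 0;
	}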
Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bc0e96b78dfd..c777964c0662 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1080,15 +1080,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } -static void copy_flags(unsigned long clone_flags, struct task_struct *p) -{ - unsigned long new_flags = p->flags; - - new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); - new_flags |= PF_FORKNOEXEC; - p->flags = new_flags; -} - SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; @@ -1238,7 +1229,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - copy_flags(clone_flags, p); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); -- cgit v1.3-14-g43fede From f0432d159601f96839f514f286eaa5b75c4112dc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:30 -0700 Subject: mm, mempolicy: remove per-process flag PF_MEMPOLICY is an unnecessary optimization for CONFIG_SLAB users. There's no significant performance degradation to checking current->mempolicy rather than current->flags & PF_MEMPOLICY in the allocation path, especially since this is considered unlikely(). Running TCP_RR with netperf-2.4.5 through localhost on a 16 cpu machine with 64GB of memory and without a mempolicy:

	threads		before		after
	     16		1249409		1244487
	     32		1281786		1246783
	     48		1239175		1239138
	     64		1244642		1241841
	     80		1244346		1248918
	     96		1266436		1254316
	    112		1307398		1312135
	    128		1327607		1326502

Per-process flags are a scarce resource so we should free them up whenever possible and make them available. We'll be using it shortly for memcg oom reserves.
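The resulting hot-path test only dereferences current, which is why the cached bit buys nothing. A compilable model of the new check (the struct is a simplified stand-in for task_struct; PF_SPREAD_SLAB's value is copied from the sched.h context in the diff below):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	#define PF_SPREAD_SLAB	0x02000000

	struct task {			/* stand-in for task_struct */
		unsigned long flags;
		void *mempolicy;	/* NULL means default policy */
	};

	/* Mirrors the new __do_cache_alloc() condition: test the
	 * mempolicy pointer itself instead of a cached PF_MEMPOLICY bit. */
	static bool wants_alternate_node(const struct task *tsk)
	{
		return tsk->mempolicy || (tsk->flags & PF_SPREAD_SLAB);
	}

	int main(void)
	{
		struct task t = { .flags = 0, .mempolicy = NULL };

		printf("%d\n", wants_alternate_node(&t));	/* 0 */
		t.mempolicy = &t;	/* any non-default policy */
		printf("%d\n", wants_alternate_node(&t));	/* 1 */
		return 0;
	}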
Signed-off-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 1 - include/linux/sched.h | 1 - kernel/fork.c | 1 - mm/mempolicy.c | 31 ------------------------------- mm/slab.c | 4 ++-- 5 files changed, 2 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index cfe55dfca015..3c1b968da0ca 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -143,7 +143,6 @@ extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, enum mpol_rebind_step step); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); -extern void mpol_fix_fork_child_flag(struct task_struct *p); extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, diff --git a/include/linux/sched.h b/include/linux/sched.h index 642477dd814a..6c70645eb3b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1851,7 +1851,6 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ #define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */ diff --git a/kernel/fork.c b/kernel/fork.c index c777964c0662..e905e9c6b224 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1276,7 +1276,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->mempolicy = NULL; goto bad_fork_cleanup_threadgroup_lock; } - mpol_fix_fork_child_flag(p); #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0ad0ba31979f..78e1472933ea 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -795,36 +795,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, return err; } -/* - * Update task->flags PF_MEMPOLICY bit: set iff non-default - * mempolicy. Allows more rapid checking of this (combined perhaps - * with other PF_* flag bits) on memory allocation hot code paths. - * - * If called from outside this file, the task 'p' should -only- be - * a newly forked child not yet visible on the task list, because - * manipulating the task flags of a visible task is not safe. - * - * The above limitation is why this routine has the funny name - * mpol_fix_fork_child_flag(). - * - * It is also safe to call this with a task pointer of current, - * which the static wrapper mpol_set_task_struct_flag() does, - * for use within this file. 
- */ - -void mpol_fix_fork_child_flag(struct task_struct *p) -{ - if (p->mempolicy) - p->flags |= PF_MEMPOLICY; - else - p->flags &= ~PF_MEMPOLICY; -} - -static void mpol_set_task_struct_flag(void) -{ - mpol_fix_fork_child_flag(current); -} - /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) @@ -861,7 +831,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, } old = current->mempolicy; current->mempolicy = new; - mpol_set_task_struct_flag(); if (new && new->mode == MPOL_INTERLEAVE && nodes_weight(new->v.nodes)) current->il_next = first_node(new->v.nodes); diff --git a/mm/slab.c b/mm/slab.c index 4b17f4c2e92d..3db4cb06e32e 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3027,7 +3027,7 @@ out: #ifdef CONFIG_NUMA /* - * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY. + * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set. * * If we are in_interrupt, then process context, including cpusets and * mempolicy, may not apply and should not be used for allocation policy. @@ -3259,7 +3259,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) { void *objp; - if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) { + if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) { objp = alternate_node_alloc(cache, flags); if (objp) goto out; -- cgit v1.3-14-g43fede From 539a13b47e462d28c48f076c63871580f694a366 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 7 Apr 2014 15:37:32 -0700 Subject: res_counter: remove interface for locked charging and uncharging The res_counter_{charge,uncharge}_locked() variants are not used in the kernel outside of the resource counter code itself, so remove the interface. Signed-off-by: David Rientjes Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tejun Heo Cc: Mel Gorman Cc: Oleg Nesterov Cc: Rik van Riel Cc: Jianguo Wu Cc: Tim Hockin Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/resource_counter.txt | 12 ++---------- include/linux/res_counter.h | 6 +----- kernel/res_counter.c | 23 ++++++++++++----------- 3 files changed, 15 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt index 5108afb3645c..762ca54eb929 100644 --- a/Documentation/cgroups/resource_counter.txt +++ b/Documentation/cgroups/resource_counter.txt @@ -76,15 +76,7 @@ to work with it. limit_fail_at parameter is set to the particular res_counter element where the charging failed. - d. int res_counter_charge_locked - (struct res_counter *rc, unsigned long val, bool force) - - The same as res_counter_charge(), but it must not acquire/release the - res_counter->lock internally (it must be called with res_counter->lock - held). The force parameter indicates whether we can bypass the limit. - - e. u64 res_counter_uncharge[_locked] - (struct res_counter *rc, unsigned long val) + d. u64 res_counter_uncharge(struct res_counter *rc, unsigned long val) When a resource is released (freed) it should be de-accounted from the resource counter it was accounted to. This is called @@ -93,7 +85,7 @@ to work with it. The _locked routines imply that the res_counter->lock is taken. - f. u64 res_counter_uncharge_until + e. 
u64 res_counter_uncharge_until (struct res_counter *rc, struct res_counter *top, unsigned long val) diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 201a69749659..56b7bc32db4f 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -104,15 +104,13 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent); * units, e.g. numbers, bytes, Kbytes, etc * * returns 0 on success and <0 if the counter->usage will exceed the - * counter->limit _locked call expects the counter->lock to be taken + * counter->limit * * charge_nofail works the same, except that it charges the resource * counter unconditionally, and returns < 0 if the after the current * charge we are over limit. */ -int __must_check res_counter_charge_locked(struct res_counter *counter, - unsigned long val, bool force); int __must_check res_counter_charge(struct res_counter *counter, unsigned long val, struct res_counter **limit_fail_at); int res_counter_charge_nofail(struct res_counter *counter, @@ -125,12 +123,10 @@ int res_counter_charge_nofail(struct res_counter *counter, * @val: the amount of the resource * * these calls check for usage underflow and show a warning on the console - * _locked call expects the counter->lock to be taken * * returns the total charges still present in @counter. */ -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); u64 res_counter_uncharge(struct res_counter *counter, unsigned long val); u64 res_counter_uncharge_until(struct res_counter *counter, diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 4aa8a305aede..51dbac6a3633 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -22,8 +22,18 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) counter->parent = parent; } -int res_counter_charge_locked(struct res_counter *counter, unsigned long val, - bool force) +static u64 res_counter_uncharge_locked(struct res_counter *counter, + unsigned long val) +{ + if (WARN_ON(counter->usage < val)) + val = counter->usage; + + counter->usage -= val; + return counter->usage; +} + +static int res_counter_charge_locked(struct res_counter *counter, + unsigned long val, bool force) { int ret = 0; @@ -86,15 +96,6 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val, return __res_counter_charge(counter, val, limit_fail_at, true); } -u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) -{ - if (WARN_ON(counter->usage < val)) - val = counter->usage; - - counter->usage -= val; - return counter->usage; -} - u64 res_counter_uncharge_until(struct res_counter *counter, struct res_counter *top, unsigned long val) -- cgit v1.3-14-g43fede From c39df5fa37b0623589508c95515b4aa1531c524e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:29 -0700 Subject: exit: call disassociate_ctty() before exit_task_namespaces() Commit 8aac62706ada ("move exit_task_namespaces() outside of exit_notify()") breaks pppd and the exiting service crashes the kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: ppp_register_channel+0x13/0x20 [ppp_generic] Call Trace: ppp_asynctty_open+0x12b/0x170 [ppp_async] tty_ldisc_open.isra.2+0x27/0x60 tty_ldisc_hangup+0x1e3/0x220 __tty_hangup+0x2c4/0x440 disassociate_ctty+0x61/0x270 do_exit+0x7f2/0xa50 ppp_register_channel() needs ->net_ns and current->nsproxy == NULL. 
Move disassociate_ctty() before exit_task_namespaces(); it doesn't make sense to delay it after perf_event_exit_task() or cgroup_exit(). This also allows us to use task_work_add() inside the (nontrivial) code paths in disassociate_ctty(). Investigated by Peter Hurley. Signed-off-by: Oleg Nesterov Reported-by: Sree Harsha Totakura Cc: Peter Hurley Cc: Sree Harsha Totakura Cc: "Eric W. Biederman" Cc: Jeff Dike Cc: Ingo Molnar Cc: Andrey Vagin Cc: Al Viro Cc: [v3.10+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 6480d1c85d7a..11f9e39a7368 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -784,6 +784,8 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + if (group_dead) + disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); @@ -799,13 +801,9 @@ void do_exit(long code) cgroup_exit(tsk); - if (group_dead) - disassociate_ctty(1); - module_put(task_thread_info(tsk)->exec_domain->module); proc_exit_connector(tsk); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ -- cgit v1.3-14-g43fede From 4bcb8232cf4eb061b086c10f56b6808adcdb5a93 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:30 -0700 Subject: exit: move check_stack_usage() to the end of do_exit() It is not clear why check_stack_usage() is called so early and thus it never checks the stack usage in, say, exit_notify() or flush_ptrace_hw_breakpoint() or other functions which are only called by do_exit(). Move the callsite down to the last preempt_disable/schedule. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 11f9e39a7368..171c9a9d7b00 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -788,7 +788,6 @@ void do_exit(long code) disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - check_stack_usage(); exit_thread(); /* @@ -842,6 +841,7 @@ void do_exit(long code) validate_creds_for_do_exit(tsk); + check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); -- cgit v1.3-14-g43fede From ef9823939e5acd5d323ff61fbc427ef998dd203e Mon Sep 17 00:00:00 2001 From: Guillaume Morin Date: Mon, 7 Apr 2014 15:38:31 -0700 Subject: kernel/exit.c: call proc_exit_connector() after exit_state is set The process events connector delivers a notification when a process exits. This is really convenient for a process that spawns and wants to monitor its children through an epoll-able interface. Unfortunately, there is a small window between when the event is delivered and the child becomes wait()-able. This creates a race if the parent wants to make sure that it knows about the exit, e.g.:

	pid_t pid = fork();

	if (pid > 0) {
		register_interest_for_pid(pid);
		if (waitpid(pid, NULL, WNOHANG) > 0) {
			/* We might have raced with exit() */
		}
		return;
	}

	/* Child */
	execve(...)

register_interest_for_pid() would be telling the connector socket reader to pay attention to events related to pid. Though this is not a bug, I think it would make the connector a bit more usable if this race was closed by simply moving the call to proc_exit_connector() from just before exit_notify() to right after.
Oleg said:

: Even with this patch the code above is still "racy" if the child is
: multi-threaded. Plus it should obviously filter-out subthreads. And
: afaics there is no way to make it reliable, even if you change the code
: above so that waitpid() is called only after the last thread exits WNOHANG
: still can fail.

Signed-off-by: Guillaume Morin Cc: Matt Helsley Cc: Oleg Nesterov Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 171c9a9d7b00..decf648574f6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -802,13 +802,13 @@ void do_exit(long code) module_put(task_thread_info(tsk)->exec_domain->module); - proc_exit_connector(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); exit_notify(tsk, group_dead); + proc_exit_connector(tsk); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); -- cgit v1.3-14-g43fede From dfccbb5e49a621c1b21a62527d61fc4305617aca Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:41 -0700 Subject: wait: fix reparent_leader() vs EXIT_DEAD->EXIT_ZOMBIE race wait_task_zombie() first does the EXIT_ZOMBIE->EXIT_DEAD transition and drops tasklist_lock. If this task is not the natural child and it is traced, we change its state back to EXIT_ZOMBIE for ->real_parent. The last transition is racy; this is even documented in 50b8d257486a "ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race". wait_consider_task() tries to detect this transition and clear ->notask_error, but we can't rely on ptrace_reparented(): the debugger can exit and do ptrace_unlink() before its sub-thread sets EXIT_ZOMBIE. And there is another problem which was missed before: this transition can also race with reparent_leader(), which doesn't reset ->exit_signal if EXIT_DEAD, assuming that this task must be reaped by someone else. So the tracee can be re-parented with ->exit_signal != SIGCHLD, and if /sbin/init doesn't use __WALL it becomes unreapable. Change reparent_leader() to update ->exit_signal even if EXIT_DEAD. Note: this is a simple temporary hack for -stable; it doesn't try to solve all problems, and it will be reverted by the next changes. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Reported-by: Michal Schmidt Tested-by: Michal Schmidt Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index decf648574f6..e354cbb13a9b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -560,9 +560,6 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -570,9 +567,19 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* We don't want people slaying init. */ + /* + * We don't want people slaying init. + * + * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() + * can change ->exit_state to EXIT_ZOMBIE.
If this is the final + * state, do_notify_parent() was already called and ->exit_signal + * doesn't matter. + */ p->exit_signal = SIGCHLD; + if (p->exit_state == EXIT_DEAD) + return; + /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { -- cgit v1.3-14-g43fede From abd50b39e783e1b6c75c7534c37f1eb2d94a89cd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:42 -0700 Subject: wait: introduce EXIT_TRACE to avoid the racy EXIT_DEAD->EXIT_ZOMBIE transition wait_task_zombie() first does the EXIT_ZOMBIE->EXIT_DEAD transition and drops tasklist_lock. If this task is not the natural child and it is traced, we change its state back to EXIT_ZOMBIE for ->real_parent. The last transition is racy; this is even documented in 50b8d257486a "ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race". wait_consider_task() tries to detect this transition and clear ->notask_error, but we can't rely on ptrace_reparented(): the debugger can exit and do ptrace_unlink() before its sub-thread sets EXIT_ZOMBIE. And there is another problem which was missed before: this transition can also race with reparent_leader(), which doesn't reset ->exit_signal if EXIT_DEAD, assuming that this task must be reaped by someone else. So the tracee can be re-parented with ->exit_signal != SIGCHLD, and if /sbin/init doesn't use __WALL it becomes unreapable. This was fixed by the previous commit, but it was a temporary hack.

1. Add the new exit_state, EXIT_TRACE. It means that the task is a traced zombie and the debugger is going to detach and notify its natural parent.

   This new state is actually EXIT_ZOMBIE | EXIT_DEAD. This way we can avoid the changes in proc/kgdb code; get_task_state() still reports "X (dead)" in this case.

   Note: with or without this change userspace can see the Z -> X -> Z transition. Not really bad, but it probably makes sense to fix.

2. Change wait_task_zombie() to use EXIT_TRACE instead of EXIT_DEAD if we need to notify the ->real_parent.

3. Revert the previous hack in reparent_leader(); now that EXIT_DEAD is always the final state we can safely ignore such a task.

4. Change wait_consider_task() to check EXIT_TRACE separately and kill the racy and no longer needed ptrace_reparented() case.

   If ptrace == T, an EXIT_TRACE thread should be simply ignored; the owner of this state is going to ptrace_unlink() this task, and we can pretend that it was already removed from the ->ptraced list.

   Otherwise we should skip this thread too, but clear ->notask_error; we must be the natural parent and the debugger is going to untrace and notify us. IOW, this doesn't differ from "EXIT_ZOMBIE && p->ptrace" even if the task was already untraced.
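Point 1 is the crux: because EXIT_TRACE is EXIT_ZOMBIE | EXIT_DEAD, the single compare-and-swap has exactly one winner, and every check that only looks at the EXIT_DEAD bit keeps working. A user-space model of the claim (state values copied from the sched.h hunk below; C11 atomics stand in for the kernel's cmpxchg):

	#include <stdatomic.h>
	#include <stdio.h>

	#define EXIT_ZOMBIE	16
	#define EXIT_DEAD	32
	#define EXIT_TRACE	(EXIT_ZOMBIE | EXIT_DEAD)

	int main(void)
	{
		_Atomic int exit_state = EXIT_ZOMBIE;
		int expected = EXIT_ZOMBIE;

		/* The first claimant wins, as in wait_task_zombie(). */
		if (atomic_compare_exchange_strong(&exit_state, &expected,
						   EXIT_TRACE))
			printf("claimed: state=%d\n", exit_state);

		/* Any second attempt loses: the state left EXIT_ZOMBIE. */
		expected = EXIT_ZOMBIE;
		if (!atomic_compare_exchange_strong(&exit_state, &expected,
						    EXIT_DEAD))
			printf("lost the race: state=%d\n", exit_state);

		/* The EXIT_DEAD bit is still set, so get_task_state()-style
		 * code keeps reporting "X (dead)". */
		printf("dead bit: %d\n", !!(exit_state & EXIT_DEAD));
		return 0;
	}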
Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Reported-by: Michal Schmidt Tested-by: Michal Schmidt Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/exit.c | 50 +++++++++++++++++++++----------------------------- 2 files changed, 22 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index f8497059f88c..7781de5e5e7b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -212,6 +212,7 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); /* in tsk->exit_state */ #define EXIT_ZOMBIE 16 #define EXIT_DEAD 32 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* in tsk->state again */ #define TASK_DEAD 64 #define TASK_WAKEKILL 128 diff --git a/kernel/exit.c b/kernel/exit.c index e354cbb13a9b..022a0ff17318 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -560,6 +560,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { list_move_tail(&p->sibling, &p->real_parent->children); + + if (p->exit_state == EXIT_DEAD) + return; /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -567,19 +570,9 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* - * We don't want people slaying init. - * - * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() - * can change ->exit_state to EXIT_ZOMBIE. If this is the final - * state, do_notify_parent() was already called and ->exit_signal - * doesn't matter. - */ + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; - if (p->exit_state == EXIT_DEAD) - return; - /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { @@ -1043,17 +1036,13 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) return wait_noreap_copyout(wo, p, pid, uid, why, status); } + traced = ptrace_reparented(p); /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: + * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); + state = traced ? EXIT_TRACE : EXIT_DEAD; + if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; - } - - traced = ptrace_reparented(p); /* * It can be ptraced but not reparented, check * thread_group_leader() to filter out sub-threads. @@ -1114,7 +1103,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. + * thread can reap it because we its state == DEAD/TRACE. */ read_unlock(&tasklist_lock); @@ -1159,14 +1148,14 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * If this is not a sub-thread, notify the parent. * If parent wants a zombie, don't release it now. 
*/ + state = EXIT_DEAD; if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + !do_notify_parent(p, p->exit_signal)) + state = EXIT_ZOMBIE; + p->exit_state = state; write_unlock_irq(&tasklist_lock); } - if (p != NULL) + if (state == EXIT_DEAD) release_task(p); return retval; @@ -1362,12 +1351,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, } /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + if (unlikely(p->exit_state == EXIT_TRACE)) { /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). + * ptrace == 0 means we are the natural parent. In this case + * we should clear notask_error, debugger will notify us. */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + if (likely(!ptrace)) wo->notask_error = 0; return 0; } -- cgit v1.3-14-g43fede From b436069059fede30ca31d4bf439cc86436ff5b1d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:43 -0700 Subject: wait: use EXIT_TRACE only if thread_group_leader(zombie) wait_task_zombie() always uses EXIT_TRACE/ptrace_unlink() if ptrace_reparented(). This is suboptimal and a bit confusing: we do not need do_notify_parent(p) if !thread_group_leader(p) and in this case we also do not need ptrace_unlink(), we can rely on ptrace_release_task(). Change wait_task_zombie() to check thread_group_leader() along with ptrace_reparented() and simplify the final p->exit_state transition. Signed-off-by: Oleg Nesterov Tested-by: Michal Schmidt Cc: Jan Kratochvil Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 022a0ff17318..4773ed990907 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1040,7 +1040,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) /* * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = traced ? EXIT_TRACE : EXIT_DEAD; + state = traced && thread_group_leader(p) ? EXIT_TRACE : EXIT_DEAD; if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; /* @@ -1140,18 +1140,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (!retval) retval = pid; - if (traced) { + if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - state = EXIT_DEAD; - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) - state = EXIT_ZOMBIE; + + /* If parent wants a zombie, don't release it now */ + state = EXIT_ZOMBIE; + if (do_notify_parent(p, p->exit_signal)) + state = EXIT_DEAD; p->exit_state = state; write_unlock_irq(&tasklist_lock); } -- cgit v1.3-14-g43fede From b3ab03160dfaf8ab78d476b670de319f4c1a5685 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:45 -0700 Subject: wait: completely ignore the EXIT_DEAD tasks Now that EXIT_DEAD is the terminal state it doesn't make sense to call eligible_child() or security_task_wait() if the task is really dead. 
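With EXIT_DEAD now terminal, the series leaves a small, easy-to-audit state machine behind. A compilable summary (an editorial reconstruction assembled from the commit messages above, not text from any single patch):

	#include <stdio.h>

	enum exit_state {
		EXIT_ZOMBIE = 16,
		EXIT_DEAD = 32,
		EXIT_TRACE = EXIT_ZOMBIE | EXIT_DEAD,
	};

	int main(void)
	{
		/*
		 * 0 (alive)   -> EXIT_ZOMBIE  exit_notify()
		 * EXIT_ZOMBIE -> EXIT_DEAD    wait_task_zombie(), no tracer
		 * EXIT_ZOMBIE -> EXIT_TRACE   wait_task_zombie(), ptraced leader
		 * EXIT_TRACE  -> EXIT_ZOMBIE  tracer detached, real parent
		 *                             wants a zombie (seen as Z -> X -> Z)
		 * EXIT_TRACE  -> EXIT_DEAD    do_notify_parent() released it
		 * EXIT_DEAD   -> (terminal)   wait_consider_task() returns 0
		 */
		printf("EXIT_TRACE = %d\n", EXIT_TRACE);	/* 48 */
		return 0;
	}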
Signed-off-by: Oleg Nesterov Tested-by: Michal Schmidt Cc: Jan Kratochvil Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4773ed990907..33cf8dba0a61 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1329,7 +1329,12 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { - int ret = eligible_child(wo, p); + int ret; + + if (unlikely(p->exit_state == EXIT_DEAD)) + return 0; + + ret = eligible_child(wo, p); if (!ret) return ret; @@ -1347,10 +1352,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) - return 0; - if (unlikely(p->exit_state == EXIT_TRACE)) { /* * ptrace == 0 means we are the natural parent. In this case -- cgit v1.3-14-g43fede From 377d75dafa07ee0da64223c9169f4e17b26c2b9a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:47 -0700 Subject: wait: WSTOPPED|WCONTINUED hangs if a zombie child is traced by real_parent "A zombie is only visible to its ptracer" logic in wait_consider_task() is very wrong. Trivial test-case:

	#include <assert.h>
	#include <unistd.h>
	#include <sys/ptrace.h>
	#include <sys/wait.h>

	int main(void)
	{
		int child = fork();

		if (!child) {
			assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0);
			return 0x23;
		}

		assert(waitid(P_ALL, child, NULL, WEXITED | WNOWAIT) == 0);
		assert(waitid(P_ALL, 0, NULL, WSTOPPED) == -1);
		return 0;
	}

It hangs in waitpid(WSTOPPED) despite the fact it has a single zombie child. This is because wait_consider_task(ptrace => 0) sees p->ptrace and clears ->notask_error assuming that the debugger should detach and notify us. Change wait_consider_task(ptrace => 0) to pretend that ptrace == T if the child is traced by us. This really simplifies the logic and allows us to do more fixes, see the next changes. This also hides the unwanted group stop state automatically; we can remove another ptrace_reparented() check. Unfortunately, this adds the following behavioural changes:

1. Before this patch wait(WEXITED | __WNOTHREAD) does not reap a natural child if it is traced by the caller's sub-thread. Hopefully nobody will ever notice this change, and I think that nobody should rely on this behaviour anyway.

2. SIGNAL_STOP_CONTINUED is no longer hidden from the debugger if it is the real parent. While this change comes as a side effect, I think it is good by itself. The group continued state can not be consumed by another process in this case; it doesn't depend on ptrace, and it doesn't make sense to hide it from the real parent.

Perhaps we should add the thread_group_leader() check before wait_task_continued()? Maybe, but this shouldn't depend on ptrace_reparented().
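The "pretend that ptrace == T" trick lands in the next hunk; condensed, it is nothing more than this (a sketch of the patch below, not a compilable unit):

	/* In wait_consider_task(), before the exit_state checks: */
	if (likely(!ptrace) && unlikely(p->ptrace)) {
		/*
		 * Traced by the caller's own thread group: behave as
		 * ptrace_do_wait() would and reap the zombie here, which
		 * also keeps a group stop from being reported twice.
		 */
		if (!ptrace_reparented(p))
			ptrace = 1;
	}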
Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Jan Kratochvil Cc: Lennart Poettering Cc: Michal Schmidt Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 33cf8dba0a61..92d38d4da4b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1362,6 +1362,22 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } + if (likely(!ptrace) && unlikely(p->ptrace)) { + /* + * If it is traced by its real parent's group, just pretend + * the caller is ptrace_do_wait() and reap this child if it + * is zombie. + * + * This also hides group stop state from real parent; otherwise + * a single stop can be reported twice as group and ptrace stop. + * If a ptracer wants to distinguish these two events for its + * own children it should create a separate process which takes + * the role of real parent. + */ + if (!ptrace_reparented(p)) + ptrace = 1; + } + /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { /* @@ -1402,19 +1418,6 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) wo->notask_error = 0; } else { - /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. -- cgit v1.3-14-g43fede From 7c733eb3eac0e3d091aaf37c183d2175eeebfb2b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:49 -0700 Subject: wait: WSTOPPED|WCONTINUED doesn't work if a zombie leader is traced by another process Even if the main thread is dead the process still can stop/continue. However, if the leader is ptraced wait_consider_task(ptrace => false) always skips wait_task_stopped/wait_task_continued, so WSTOPPED or WCONTINUED can never work for the natural parent in this case. Move the "A zombie ptracee is only visible to its ptracer" check into the "if (!delay_group_leader(p))" block. ->notask_error is cleared by the "fall through" code below. This depends on the previous change, wait_task_stopped/continued must be avoided if !delay_group_leader() and the tracer is ->real_parent. Otherwise WSTOPPED|WEXITED could wrongly report "stopped" when the child is already dead (single-threaded or not). If it is traced by another task then the "stopped" state is fine until the debugger detaches and reveals a zombie state. 
Stupid test-case:

	void *tfunc(void *arg)
	{
		sleep(1);	// wait for zombie leader
		raise(SIGSTOP);
		exit(0x13);
		return NULL;
	}

	int run_child(void)
	{
		pthread_t thread;

		if (!fork()) {
			int tracee = getppid();

			assert(ptrace(PTRACE_ATTACH, tracee, 0,0) == 0);
			do
				ptrace(PTRACE_CONT, tracee, 0,0);
			while (wait(NULL) > 0);

			return 0;
		}

		sleep(1);	// wait for PTRACE_ATTACH
		assert(pthread_create(&thread, NULL, tfunc, NULL) == 0);
		pthread_exit(NULL);
	}

	int main(void)
	{
		int child, stat;

		child = fork();
		if (!child)
			return run_child();

		assert(child == waitpid(-1, &stat, WSTOPPED));
		assert(stat == 0x137f);

		kill(child, SIGCONT);
		assert(child == waitpid(-1, &stat, WCONTINUED));
		assert(stat == 0xffff);

		assert(child == waitpid(-1, &stat, 0));
		assert(stat == 0x1300);

		return 0;
	}

Without this patch it hangs in waitpid(WSTOPPED); wait_task_stopped() is never called. Note: this doesn't fix all problems with a zombie delay_group_leader(); the WCONTINUED | WEXITED check is not exactly right, and a debugger can't assume it will be notified if another thread reaps the whole thread group. Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Jan Kratochvil Cc: Lennart Poettering Cc: Michal Schmidt Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 92d38d4da4b1..6ed6a1d552b5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1380,20 +1380,16 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, /* slay zombie? */ if (p->exit_state == EXIT_ZOMBIE) { - /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. - */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } - /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); + if (!delay_group_leader(p)) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the + * real parent when the ptracer detaches. + */ + if (unlikely(ptrace) || likely(!p->ptrace)) + return wait_task_zombie(wo, p); + } /* * Allow access to stopped/continued state via zombie by -- cgit v1.3-14-g43fede From 80df28476505ed4e6701c3448c63c9229a50c655 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Mon, 7 Apr 2014 15:38:57 -0700 Subject: hung_task: check the value of "sysctl_hung_task_timeout_sec" As sysctl_hung_task_timeout_sec is unsigned long, when this value is larger than LONG_MAX/HZ, the function schedule_timeout_interruptible in watchdog will return immediately without sleeping and print:

	schedule_timeout: wrong timeout value ffffffffffffff83

and then the function watchdog will call schedule_timeout_interruptible again and again. The screen will be filled with

	"schedule_timeout: wrong timeout value ffffffffffffff83"

This patch does some checking and correction in sysctl, to let the function schedule_timeout_interruptible always get a valid parameter.
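The arithmetic behind the fix: the seconds value is converted to jiffies and handed to schedule_timeout_interruptible(), whose parameter is a signed long. A user-space reproduction of the wrap (HZ hard-coded to 1000 here; the unsigned-to-signed conversion assumes the usual two's-complement behaviour):

	#include <limits.h>
	#include <stdio.h>

	#define HZ 1000

	int main(void)
	{
		unsigned long secs = LONG_MAX / HZ + 1;	/* just over the new cap */
		long timeout = (long)(secs * HZ);	/* wraps negative */

		/* schedule_timeout() rejects negative values with
		 * "schedule_timeout: wrong timeout value %lx" and returns
		 * immediately, which produces the reported message flood. */
		printf("secs=%lu -> timeout=%ld\n", secs, timeout);
		return 0;
	}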
Signed-off-by: Liu Hua Tested-by: Satoru Takeuchi Cc: [3.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 1 + kernel/sysctl.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 271a09db6629..9886c3d57fc2 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -317,6 +317,7 @@ for more than this value report a warning. This file shows up if CONFIG_DETECT_HUNG_TASK is enabled. 0: means infinite timeout - no checking done. +Possible values to set are in range {0..LONG_MAX/HZ}. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5c14b547882e..74f5b580fe34 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -141,6 +141,11 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +#ifdef CONFIG_DETECT_HUNG_TASK +static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); +#endif + #ifdef CONFIG_INOTIFY_USER #include #endif @@ -985,6 +990,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, }, { .procname = "hung_task_warnings", -- cgit v1.3-14-g43fede From d7c0847fe3682a026ee6d147c5b6b8ab457fffc8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 7 Apr 2014 15:39:03 -0700 Subject: kernel/panic.c: display reason at end + pr_emerg Currently, booting without initrd specified on an 80x25 screen gives a call trace followed by atkbd : Spurious ACK. The original message ("VFS: Unable to mount root fs") is not available. Of course this could happen in other situations... This patch displays the panic reason after the call trace, which could help a lot of people even if it's not the very last line on screen. Also, convert all panic.c printk(KERN_EMERG to pr_emerg( [akpm@linux-foundation.org: missed a couple of pr_ conversions] Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 79fd820bb5e8..d02fa9fef46a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -100,7 +100,7 @@ void panic(const char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); + pr_emerg("Kernel panic - not syncing: %s\n", buf); #ifdef CONFIG_DEBUG_BUGVERBOSE /* * Avoid nested stack-dumping if a panic occurs during oops processing @@ -141,7 +141,7 @@ void panic(const char *fmt, ...) * Delay timeout seconds before rebooting the machine. * We can't use the "normal" timers since we just panicked. */ - printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + pr_emerg("Rebooting in %d seconds..", panic_timeout); for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { touch_nmi_watchdog(); @@ -165,7 +165,7 @@ void panic(const char *fmt, ...)
extern int stop_a_enabled; /* Make sure the user can actually press Stop-A (L1-A) */ stop_a_enabled = 1; - printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); + pr_emerg("Press Stop-A (L1-A) to return to the boot prom\n"); } #endif #if defined(CONFIG_S390) @@ -176,6 +176,7 @@ void panic(const char *fmt, ...) disabled_wait(caller); } #endif + pr_emerg("---[ end Kernel panic - not syncing: %s\n", buf); local_irq_enable(); for (i = 0; ; i += PANIC_TIMER_STEP) { touch_softlockup_watchdog(); @@ -276,8 +277,7 @@ unsigned long get_taint(void) void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) { if (lockdep_ok == LOCKDEP_NOW_UNRELIABLE && __debug_locks_off()) - printk(KERN_WARNING - "Disabling lock debugging due to kernel taint\n"); + pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); } @@ -382,8 +382,7 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); - printk(KERN_WARNING "---[ end trace %016llx ]---\n", - (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } /* -- cgit v1.3-14-g43fede From 52f5684c8e1ec7463192aba8e2916df49807511a Mon Sep 17 00:00:00 2001 From: Gideon Israel Dsouza Date: Mon, 7 Apr 2014 15:39:20 -0700 Subject: kernel: use macros from compiler.h instead of __attribute__((...)) To increase compiler portability there is which provides convenience macros for various gcc constructs. Eg: __weak for __attribute__((weak)). I've replaced all instances of gcc attributes with the right macro in the kernel subsystem. Signed-off-by: Gideon Israel Dsouza Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- kernel/kallsyms.c | 11 ++++++----- kernel/kexec.c | 5 +++-- kernel/ksysfs.c | 5 +++-- kernel/power/power.h | 3 ++- kernel/power/snapshot.c | 3 ++- kernel/power/suspend.c | 5 +++-- kernel/power/swap.c | 2 +- kernel/sched/clock.c | 3 ++- kernel/sched/core.c | 3 ++- kernel/signal.c | 4 +++- kernel/time/timekeeping.c | 5 +++-- kernel/trace/trace.h | 3 ++- 13 files changed, 34 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e905e9c6b224..54a8d26f612f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -286,7 +287,7 @@ void __init fork_init(unsigned long mempages) init_task.signal->rlim[RLIMIT_NPROC]; } -int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst, +int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3127ad52cdb2..cb0cf37dac3a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -36,8 +37,8 @@ * These will be re-linked against their real values * during the second link stage. 
*/ -extern const unsigned long kallsyms_addresses[] __attribute__((weak)); -extern const u8 kallsyms_names[] __attribute__((weak)); +extern const unsigned long kallsyms_addresses[] __weak; +extern const u8 kallsyms_names[] __weak; /* * Tell the compiler that the count isn't in the small data section if the arch @@ -46,10 +47,10 @@ extern const u8 kallsyms_names[] __attribute__((weak)); extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); -extern const u8 kallsyms_token_table[] __attribute__((weak)); -extern const u16 kallsyms_token_index[] __attribute__((weak)); +extern const u8 kallsyms_token_table[] __weak; +extern const u16 kallsyms_token_index[] __weak; -extern const unsigned long kallsyms_markers[] __attribute__((weak)); +extern const unsigned long kallsyms_markers[] __weak; static inline int is_kernel_inittext(unsigned long addr) { diff --git a/kernel/kexec.c b/kernel/kexec.c index c0d261c7db7b..c8380ad203bc 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -1551,10 +1552,10 @@ void vmcoreinfo_append_str(const char *fmt, ...) * provide an empty default implementation here -- architecture * code may override this */ -void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) +void __weak arch_crash_save_vmcoreinfo(void) {} -unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) +unsigned long __weak paddr_vmcoreinfo_note(void) { return __pa((unsigned long)(char *)&vmcoreinfo_note); } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e660964086e2..2495a9b14ac8 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* rcu_expedited */ @@ -162,8 +163,8 @@ KERNEL_ATTR_RW(rcu_expedited); /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
*/ -extern const void __start_notes __attribute__((weak)); -extern const void __stop_notes __attribute__((weak)); +extern const void __start_notes __weak; +extern const void __stop_notes __weak; #define notes_size (&__stop_notes - &__start_notes) static ssize_t notes_read(struct file *filp, struct kobject *kobj, diff --git a/kernel/power/power.h b/kernel/power/power.h index 1ca753106557..15f37ea08719 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -2,6 +2,7 @@ #include #include #include +#include struct swsusp_info { struct new_utsname uts; @@ -11,7 +12,7 @@ struct swsusp_info { unsigned long image_pages; unsigned long pages; unsigned long size; -} __attribute__((aligned(PAGE_SIZE))); +} __aligned(PAGE_SIZE); #ifdef CONFIG_HIBERNATION /* kernel/power/snapshot.c */ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 149e745eaa52..18fb7a2fb14b 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -155,7 +156,7 @@ static inline void free_image_page(void *addr, int clear_nosave_free) struct linked_page { struct linked_page *next; char data[LINKED_PAGE_DATA_SIZE]; -} __attribute__((packed)); +} __packed; static inline void free_list_of_pages(struct linked_page *list, int clear_page_nosave) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 90b3d9366d1a..c3ad9cafe930 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "power.h" @@ -156,13 +157,13 @@ static int suspend_prepare(suspend_state_t state) } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_disable_irqs(void) +void __weak arch_suspend_disable_irqs(void) { local_irq_disable(); } /* default implementation */ -void __attribute__ ((weak)) arch_suspend_enable_irqs(void) +void __weak arch_suspend_enable_irqs(void) { local_irq_enable(); } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c33ed200410..8c9a4819f798 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -101,7 +101,7 @@ struct swsusp_header { unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; char sig[10]; -} __attribute__((packed)); +} __packed; static struct swsusp_header *swsusp_header; diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index b30a2924ef14..3ef6451e972e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -60,13 +60,14 @@ #include #include #include +#include /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. * Architectures and sub-architectures can override this. */ -unsigned long long __attribute__((weak)) sched_clock(void) +unsigned long long __weak sched_clock(void) { return (unsigned long long)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1d1b87b36778..80bd491b718c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include @@ -6498,7 +6499,7 @@ static cpumask_var_t fallback_doms; * cpu core maps. It is supposed to return 1 if the topology changed * or 0 if it stayed the same. 
*/ -int __attribute__((weak)) arch_update_cpu_topology(void) +int __weak arch_update_cpu_topology(void) { return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 5d4b05a229a6..6ea13c09ae56 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -33,6 +33,8 @@ #include #include #include +#include + #define CREATE_TRACE_POINTS #include @@ -3618,7 +3620,7 @@ SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask) } #endif -__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +__weak const char *arch_vma_name(struct vm_area_struct *vma) { return NULL; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b40279ecd71..f7df8ea21707 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "tick-internal.h" #include "ntp_internal.h" @@ -760,7 +761,7 @@ u64 timekeeping_max_deferment(void) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_persistent_clock(struct timespec *ts) +void __weak read_persistent_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; @@ -775,7 +776,7 @@ void __attribute__((weak)) read_persistent_clock(struct timespec *ts) * * XXX - Do be sure to remove it once all arches implement it. */ -void __attribute__((weak)) read_boot_clock(struct timespec *ts) +void __weak read_boot_clock(struct timespec *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ffc314b7e92b..2e29d7ba5a52 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ @@ -1279,7 +1280,7 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ - __attribute__((__aligned__(4))) event_##call; + __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ -- cgit v1.3-14-g43fede From 08f141d3dbddacb70aba1541bc5f950e466591e9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 7 Apr 2014 15:39:39 -0700 Subject: modules: use raw_cpu_write for initialization of per cpu refcount. The initialization of a structure is not subject to synchronization. The use of __this_cpu would trigger a false positive with the additional preemption checks for __this_cpu ops. So simply disable the check through the use of raw_cpu ops. 
Trace: __this_cpu_write operation in preemptible [00000000] code: modprobe/286 caller is __this_cpu_preempt_check+0x38/0x60 CPU: 3 PID: 286 Comm: modprobe Tainted: GF 3.12.0-rc4+ #187 Call Trace: dump_stack+0x4e/0x82 check_preemption_disabled+0xec/0x110 __this_cpu_preempt_check+0x38/0x60 load_module+0xcfd/0x2650 SyS_init_module+0xa6/0xd0 tracesys+0xe1/0xe6 Signed-off-by: Christoph Lameter Acked-by: Ingo Molnar Acked-by: Rusty Russell Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 29f7790eaa14..11869408f79b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -640,7 +640,7 @@ static int module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->target_list); /* Hold reference count during initialization. */ - __this_cpu_write(mod->refptr->incs, 1); + raw_cpu_write(mod->refptr->incs, 1); return 0; } -- cgit v1.3-14-g43fede From 64b47e8fdb40a0d46e8cf458dd3e24f8afa073f6 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Mon, 7 Apr 2014 15:39:45 -0700 Subject: lglock: map to spinlock when !CONFIG_SMP When the system has only one CPU, lglock is effectively a spinlock; map it directly to spinlock to eliminate the indirection and duplicate code. In addition to removing overhead, this drops 1.6k of code with a defconfig modified to have !CONFIG_SMP, and 1.1k with a minimal config. Signed-off-by: Josh Triplett Cc: Rusty Russell Cc: Michal Marek Cc: Thomas Gleixner Cc: David Howells Cc: "H. Peter Anvin" Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lglock.h | 16 ++++++++++++++++ kernel/locking/Makefile | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/lglock.h b/include/linux/lglock.h index 96549abe8842..0081f000e34b 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -25,6 +25,8 @@ #include #include +#ifdef CONFIG_SMP + #ifdef CONFIG_DEBUG_LOCK_ALLOC #define LOCKDEP_INIT_MAP lockdep_init_map #else @@ -57,4 +59,18 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu); void lg_global_lock(struct lglock *lg); void lg_global_unlock(struct lglock *lg); +#else +/* When !CONFIG_SMP, map lglock to spinlock */ +#define lglock spinlock +#define DEFINE_LGLOCK(name) DEFINE_SPINLOCK(name) +#define DEFINE_STATIC_LGLOCK(name) static DEFINE_SPINLOCK(name) +#define lg_lock_init(lg, name) spin_lock_init(lg) +#define lg_local_lock spin_lock +#define lg_local_unlock spin_unlock +#define lg_local_lock_cpu(lg, cpu) spin_lock(lg) +#define lg_local_unlock_cpu(lg, cpu) spin_unlock(lg) +#define lg_global_lock spin_lock +#define lg_global_unlock spin_unlock +#endif + #endif diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 306a76b51e0f..b8bdcd4785b7 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o +obj-y += mutex.o semaphore.o rwsem.o mcs_spinlock.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = -pg @@ -14,6 +14,7 @@ ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_SMP) += lglock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o -- cgit v1.3-14-g43fede From de7b2973903c6cc50b31ee5682a69b2219b9919d Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers 
From de7b2973903c6cc50b31ee5682a69b2219b9919d Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers
Date: Tue, 8 Apr 2014 17:26:21 -0400
Subject: tracepoint: Use struct pointer instead of name hash for reg/unreg
 tracepoints

Register/unregister tracepoint probes with struct tracepoint pointer
rather than tracepoint name.

This change, which vastly simplifies tracepoint.c, has been proposed by
Steven Rostedt. It also removes 8.8kB (mostly of text) from the vmlinux
size.

From this point on, the tracers need to pass a struct tracepoint pointer
to probe register/unregister. A probe can now only be connected to a
tracepoint that exists. Moreover, tracers are responsible for
unregistering the probe before the module containing its associated
tracepoint is unloaded.

   text	   data	    bss	    dec	    hex	filename
10443444	4282528	10391552	25117524	17f4354	vmlinux.orig
10434930	4282848	10391552	25109330	17f2352	vmlinux

Link: http://lkml.kernel.org/r/1396992381-23785-2-git-send-email-mathieu.desnoyers@efficios.com

CC: Ingo Molnar
CC: Frederic Weisbecker
CC: Andrew Morton
CC: Frank Ch. Eigler
CC: Johannes Berg
Signed-off-by: Mathieu Desnoyers
[ SDR - fixed return val in void func in tracepoint_module_going() ]
Signed-off-by: Steven Rostedt
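
To illustrate the new calling convention, here is a minimal sketch of a
tracer module attaching a probe; the module and probe names are
hypothetical, and sched_switch stands in for any existing tracepoint.
Because the register_trace_##name() wrapper now hands the tracepoint
instance itself to tracepoint_probe_register(), a misspelled tracepoint
name fails at build time instead of silently missing at runtime:

	#include <linux/module.h>
	#include <linux/sched.h>
	#include <trace/events/sched.h>

	/*
	 * Hypothetical probe; its prototype must match the tracepoint's
	 * TP_PROTO, preceded by the private data pointer.
	 */
	static void example_probe(void *data, struct task_struct *prev,
				  struct task_struct *next)
	{
		/* ... e.g. count context switches ... */
	}

	static int __init example_init(void)
	{
		/*
		 * With this patch, expands to
		 * tracepoint_probe_register(&__tracepoint_sched_switch,
		 *			     (void *)example_probe, NULL);
		 */
		return register_trace_sched_switch(example_probe, NULL);
	}

	static void __exit example_exit(void)
	{
		/* The tracer must detach before the tracepoint goes away. */
		unregister_trace_sched_switch(example_probe, NULL);
		tracepoint_synchronize_unregister();
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");
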
---
 include/linux/ftrace_event.h        |  22 +-
 include/linux/tracepoint.h          |  41 +--
 include/trace/ftrace.h              |   9 +-
 kernel/trace/trace_events.c         |  55 ++--
 kernel/trace/trace_events_trigger.c |   2 +-
 kernel/trace/trace_kprobe.c         |  21 +-
 kernel/trace/trace_output.c         |   2 +-
 kernel/trace/trace_uprobe.c         |  20 +-
 kernel/tracepoint.c                 | 511 ++++++++++++++++--------------------
 9 files changed, 331 insertions(+), 352 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index cdc30111d2f8..d16da3e53bc7 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include

 struct trace_array;
 struct trace_buffer;
@@ -232,6 +233,7 @@ enum {
 	TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
+	TRACE_EVENT_FL_TRACEPOINT_BIT,
 };

 /*
@@ -244,6 +246,7 @@ enum {
  *  (used for module unloading, if a module event is enabled,
  *   it is best to clear the buffers that used it).
  *  USE_CALL_FILTER - For ftrace internal events, don't use file filter
  *  TRACEPOINT    - Event is a tracepoint
  */
 enum {
 	TRACE_EVENT_FL_FILTERED	= (1 << TRACE_EVENT_FL_FILTERED_BIT),
@@ -252,12 +255,17 @@ enum {
 	TRACE_EVENT_FL_IGNORE_ENABLE	= (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
+	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
 };

 struct ftrace_event_call {
 	struct list_head	list;
 	struct ftrace_event_class *class;
-	char			*name;
+	union {
+		char			*name;
+		/* Set TRACE_EVENT_FL_TRACEPOINT flag when using "tp" */
+		struct tracepoint	*tp;
+	};
 	struct trace_event	event;
 	const char		*print_fmt;
 	struct event_filter	*filter;
@@ -271,6 +279,7 @@ struct ftrace_event_call {
  *   bit 3:		ftrace internal event (do not enable)
  *   bit 4:		Event was enabled by module
  *   bit 5:		use call filter rather than file filter
+ *   bit 6:		Event is a tracepoint
  */
 	int			flags; /* static flags of different events */

@@ -283,6 +292,15 @@ struct ftrace_event_call {
 #endif
 };

+static inline const char *
+ftrace_event_name(struct ftrace_event_call *call)
+{
+	if (call->flags & TRACE_EVENT_FL_TRACEPOINT)
+		return call->tp ? call->tp->name : NULL;
+	else
+		return call->name;
+}
+
 struct trace_array;
 struct ftrace_subsystem_dir;

@@ -353,7 +371,7 @@ struct ftrace_event_file {
 #define __TRACE_EVENT_FLAGS(name, value)				\
 	static int __init trace_init_flags_##name(void)		\
 	{								\
-		event_##name.flags = value;				\
+		event_##name.flags |= value;				\
 		return 0;						\
 	}								\
 	early_initcall(trace_init_flags_##name);
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 812b2553dfd8..08150e265761 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -6,7 +6,7 @@
  *
  * See Documentation/trace/tracepoints.txt.
  *
- * (C) Copyright 2008 Mathieu Desnoyers
+ * Copyright (C) 2008-2014 Mathieu Desnoyers
  *
  * Heavily inspired from the Linux Kernel Markers.
  *
@@ -21,6 +21,7 @@

 struct module;
 struct tracepoint;
+struct notifier_block;

 struct tracepoint_func {
 	void *func;
@@ -35,18 +36,13 @@ struct tracepoint {
 	struct tracepoint_func __rcu *funcs;
 };

-/*
- * Connect a probe to a tracepoint.
- * Internal API, should not be used directly.
- */
-extern int tracepoint_probe_register(const char *name, void *probe, void *data);
-
-/*
- * Disconnect a probe from a tracepoint.
- * Internal API, should not be used directly.
- */
 extern int
-tracepoint_probe_unregister(const char *name, void *probe, void *data);
+tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data);
+extern int
+tracepoint_probe_unregister(struct tracepoint *tp, void *probe, void *data);
+extern void
+for_each_kernel_tracepoint(void (*fct)(struct tracepoint *tp, void *priv),
+			   void *priv);

 #ifdef CONFIG_MODULES
 struct tp_module {
@@ -54,12 +50,25 @@ struct tp_module {
 	unsigned int num_tracepoints;
 	struct tracepoint * const *tracepoints_ptrs;
 };
+
 bool trace_module_has_bad_taint(struct module *mod);
+extern int register_tracepoint_module_notifier(struct notifier_block *nb);
+extern int unregister_tracepoint_module_notifier(struct notifier_block *nb);
 #else
 static inline bool trace_module_has_bad_taint(struct module *mod)
 {
 	return false;
 }
+static inline
+int register_tracepoint_module_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+static inline
+int unregister_tracepoint_module_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
 #endif /* CONFIG_MODULES */

 /*
@@ -160,14 +169,14 @@ static inline void tracepoint_synchronize_unregister(void)
 	static inline int						\
 	register_trace_##name(void (*probe)(data_proto), void *data)	\
 	{								\
-		return tracepoint_probe_register(#name, (void *)probe,	\
-						 data);			\
+		return tracepoint_probe_register(&__tracepoint_##name,	\
+						 (void *)probe, data);	\
 	}								\
 	static inline int						\
 	unregister_trace_##name(void (*probe)(data_proto), void *data)	\
 	{								\
-		return tracepoint_probe_unregister(#name, (void *)probe, \
-						   data);		\
+		return tracepoint_probe_unregister(&__tracepoint_##name,\
+						   (void *)probe, data);\
 	}								\
 	static inline void						\
 	check_trace_callback_type_##name(void (*cb)(data_proto))	\
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 8765126b328c..9c44c11cd9bb 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -470,10 +470,11 @@ static inline notrace int ftrace_get_offsets_##call(			\
  * };
  *
  * static struct ftrace_event_call event_ = {
- *	.name			= "",
+ *	.tp			= &__tracepoint_,
  *	.class			= event_class_