From 546a9d8519ed137b2804a3f5a3659003039dd49c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 19 Jun 2014 14:57:10 -0700
Subject: rcu: Export debug_init_rcu_head() and and debug_init_rcu_head()

Currently, call_rcu() relies on implicit allocation and initialization
for the debug-objects handling of RCU callbacks.  If you hammer the
kernel hard enough with Sasha's modified version of trinity, you can end
up with the sl*b allocators recursing into themselves via this implicit
call_rcu() allocation.

This commit therefore exports the debug_init_rcu_head() and
debug_rcu_head_free() functions, which permits the allocators to allocated
and pre-initialize the debug-objects information, so that there no longer
any need for call_rcu() to do that initialization, which in turn prevents
the recursion into the memory allocators.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Looks-good-to: Christoph Lameter <cl@linux.com>
---
 include/linux/rcupdate.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 5a75d19aa661..13bbfbde41b9 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -358,9 +358,19 @@ void wait_rcu_gp(call_rcu_func_t crf);
  * initialization.
  */
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+void init_rcu_head(struct rcu_head *head);
+void destroy_rcu_head(struct rcu_head *head);
 void init_rcu_head_on_stack(struct rcu_head *head);
 void destroy_rcu_head_on_stack(struct rcu_head *head);
 #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline void init_rcu_head(struct rcu_head *head)
+{
+}
+
+static inline void destroy_rcu_head(struct rcu_head *head)
+{
+}
+
 static inline void init_rcu_head_on_stack(struct rcu_head *head)
 {
 }
-- 
cgit v1.2.3-59-g8ed1b


From 4a81e8328d3791a4f99bf5b436d050f6dc5ffea3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 20 Jun 2014 16:49:01 -0700
Subject: rcu: Reduce overhead of cond_resched() checks for RCU

Commit ac1bea85781e (Make cond_resched() report RCU quiescent states)
fixed a problem where a CPU looping in the kernel with but one runnable
task would give RCU CPU stall warnings, even if the in-kernel loop
contained cond_resched() calls.  Unfortunately, in so doing, it introduced
performance regressions in Anton Blanchard's will-it-scale "open1" test.
The problem appears to be not so much the increased cond_resched() path
length as an increase in the rate at which grace periods complete, which
increased per-update grace-period overhead.

This commit takes a different approach to fixing this bug, mainly by
moving the RCU-visible quiescent state from cond_resched() to
rcu_note_context_switch(), and by further reducing the check to a
simple non-zero test of a single per-CPU variable.  However, this
approach requires that the force-quiescent-state processing send
resched IPIs to the offending CPUs.  These will be sent only once
the grace period has reached an age specified by the boot/sysfs
parameter rcutree.jiffies_till_sched_qs, or once the grace period
reaches an age halfway to the point at which RCU CPU stall warnings
will be emitted, whichever comes first.

Reported-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
[ paulmck: Made rcu_momentary_dyntick_idle() as suggested by the
  ktest build robot.  Also fixed smp_mb() comment as noted by
  Oleg Nesterov. ]

Merge with e552592e (Reduce overhead of cond_resched() checks for RCU)

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt |   6 ++
 include/linux/rcupdate.h            |  36 ----------
 kernel/rcu/tree.c                   | 140 ++++++++++++++++++++++++++++--------
 kernel/rcu/tree.h                   |   6 +-
 kernel/rcu/tree_plugin.h            |   2 +-
 kernel/rcu/update.c                 |  18 -----
 kernel/sched/core.c                 |   7 +-
 7 files changed, 125 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6eaa9cdb7094..910c3829f81d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2785,6 +2785,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			leaf rcu_node structure.  Useful for very large
 			systems.
 
+	rcutree.jiffies_till_sched_qs= [KNL]
+			Set required age in jiffies for a
+			given grace period before RCU starts
+			soliciting quiescent-state help from
+			rcu_note_context_switch().
+
 	rcutree.jiffies_till_first_fqs= [KNL]
 			Set delay from grace-period initialization to
 			first attempt to force quiescent states.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 13bbfbde41b9..6a94cc8b1ca0 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -44,7 +44,6 @@
 #include <linux/debugobjects.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
-#include <linux/percpu.h>
 #include <asm/barrier.h>
 
 extern int rcu_expedited; /* for sysctl */
@@ -299,41 +298,6 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev,
 bool __rcu_is_watching(void);
 #endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */
 
-/*
- * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
- */
-
-#define RCU_COND_RESCHED_LIM 256	/* ms vs. 100s of ms. */
-DECLARE_PER_CPU(int, rcu_cond_resched_count);
-void rcu_resched(void);
-
-/*
- * Is it time to report RCU quiescent states?
- *
- * Note unsynchronized access to rcu_cond_resched_count.  Yes, we might
- * increment some random CPU's count, and possibly also load the result from
- * yet another CPU's count.  We might even clobber some other CPU's attempt
- * to zero its counter.  This is all OK because the goal is not precision,
- * but rather reasonable amortization of rcu_note_context_switch() overhead
- * and extremely high probability of avoiding RCU CPU stall warnings.
- * Note that this function has to be preempted in just the wrong place,
- * many thousands of times in a row, for anything bad to happen.
- */
-static inline bool rcu_should_resched(void)
-{
-	return raw_cpu_inc_return(rcu_cond_resched_count) >=
-	       RCU_COND_RESCHED_LIM;
-}
-
-/*
- * Report quiscent states to RCU if it is time to do so.
- */
-static inline void rcu_cond_resched(void)
-{
-	if (unlikely(rcu_should_resched()))
-		rcu_resched();
-}
-
 /*
  * Infrastructure to implement the synchronize_() primitives in
  * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f1ba77363fbb..625d0b0cd75a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -206,6 +206,70 @@ void rcu_bh_qs(int cpu)
 	rdp->passed_quiesce = 1;
 }
 
+static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+
+static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
+	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
+	.dynticks = ATOMIC_INIT(1),
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
+	.dynticks_idle = ATOMIC_INIT(1),
+#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+};
+
+/*
+ * Let the RCU core know that this CPU has gone through the scheduler,
+ * which is a quiescent state.  This is called when the need for a
+ * quiescent state is urgent, so we burn an atomic operation and full
+ * memory barriers to let the RCU core know about it, regardless of what
+ * this CPU might (or might not) do in the near future.
+ *
+ * We inform the RCU core by emulating a zero-duration dyntick-idle
+ * period, which we in turn do by incrementing the ->dynticks counter
+ * by two.
+ */
+static void rcu_momentary_dyntick_idle(void)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+	struct rcu_dynticks *rdtp;
+	int resched_mask;
+	struct rcu_state *rsp;
+
+	local_irq_save(flags);
+
+	/*
+	 * Yes, we can lose flag-setting operations.  This is OK, because
+	 * the flag will be set again after some delay.
+	 */
+	resched_mask = raw_cpu_read(rcu_sched_qs_mask);
+	raw_cpu_write(rcu_sched_qs_mask, 0);
+
+	/* Find the flavor that needs a quiescent state. */
+	for_each_rcu_flavor(rsp) {
+		rdp = raw_cpu_ptr(rsp->rda);
+		if (!(resched_mask & rsp->flavor_mask))
+			continue;
+		smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */
+		if (ACCESS_ONCE(rdp->mynode->completed) !=
+		    ACCESS_ONCE(rdp->cond_resched_completed))
+			continue;
+
+		/*
+		 * Pretend to be momentarily idle for the quiescent state.
+		 * This allows the grace-period kthread to record the
+		 * quiescent state, with no need for this CPU to do anything
+		 * further.
+		 */
+		rdtp = this_cpu_ptr(&rcu_dynticks);
+		smp_mb__before_atomic(); /* Earlier stuff before QS. */
+		atomic_add(2, &rdtp->dynticks);  /* QS. */
+		smp_mb__after_atomic(); /* Later stuff after QS. */
+		break;
+	}
+	local_irq_restore(flags);
+}
+
 /*
  * Note a context switch.  This is a quiescent state for RCU-sched,
  * and requires special handling for preemptible RCU.
@@ -216,19 +280,12 @@ void rcu_note_context_switch(int cpu)
 	trace_rcu_utilization(TPS("Start context switch"));
 	rcu_sched_qs(cpu);
 	rcu_preempt_note_context_switch(cpu);
+	if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
+		rcu_momentary_dyntick_idle();
 	trace_rcu_utilization(TPS("End context switch"));
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
-static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
-	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
-	.dynticks = ATOMIC_INIT(1),
-#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
-	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
-	.dynticks_idle = ATOMIC_INIT(1),
-#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-};
-
 static long blimit = 10;	/* Maximum callbacks per rcu_do_batch. */
 static long qhimark = 10000;	/* If this many pending, ignore blimit. */
 static long qlowmark = 100;	/* Once only this many pending, use blimit. */
@@ -243,6 +300,13 @@ static ulong jiffies_till_next_fqs = ULONG_MAX;
 module_param(jiffies_till_first_fqs, ulong, 0644);
 module_param(jiffies_till_next_fqs, ulong, 0644);
 
+/*
+ * How long the grace period must be before we start recruiting
+ * quiescent-state help from rcu_note_context_switch().
+ */
+static ulong jiffies_till_sched_qs = HZ / 20;
+module_param(jiffies_till_sched_qs, ulong, 0644);
+
 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
 				  struct rcu_data *rdp);
 static void force_qs_rnp(struct rcu_state *rsp,
@@ -853,6 +917,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 				    bool *isidle, unsigned long *maxj)
 {
 	unsigned int curr;
+	int *rcrmp;
 	unsigned int snap;
 
 	curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
@@ -893,27 +958,43 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
 	}
 
 	/*
-	 * There is a possibility that a CPU in adaptive-ticks state
-	 * might run in the kernel with the scheduling-clock tick disabled
-	 * for an extended time period.  Invoke rcu_kick_nohz_cpu() to
-	 * force the CPU to restart the scheduling-clock tick in this
-	 * CPU is in this state.
-	 */
-	rcu_kick_nohz_cpu(rdp->cpu);
-
-	/*
-	 * Alternatively, the CPU might be running in the kernel
-	 * for an extended period of time without a quiescent state.
-	 * Attempt to force the CPU through the scheduler to gain the
-	 * needed quiescent state, but only if the grace period has gone
-	 * on for an uncommonly long time.  If there are many stuck CPUs,
-	 * we will beat on the first one until it gets unstuck, then move
-	 * to the next.  Only do this for the primary flavor of RCU.
+	 * A CPU running for an extended time within the kernel can
+	 * delay RCU grace periods.  When the CPU is in NO_HZ_FULL mode,
+	 * even context-switching back and forth between a pair of
+	 * in-kernel CPU-bound tasks cannot advance grace periods.
+	 * So if the grace period is old enough, make the CPU pay attention.
+	 * Note that the unsynchronized assignments to the per-CPU
+	 * rcu_sched_qs_mask variable are safe.  Yes, setting of
+	 * bits can be lost, but they will be set again on the next
+	 * force-quiescent-state pass.  So lost bit sets do not result
+	 * in incorrect behavior, merely in a grace period lasting
+	 * a few jiffies longer than it might otherwise.  Because
+	 * there are at most four threads involved, and because the
+	 * updates are only once every few jiffies, the probability of
+	 * lossage (and thus of slight grace-period extension) is
+	 * quite low.
+	 *
+	 * Note that if the jiffies_till_sched_qs boot/sysfs parameter
+	 * is set too high, we override with half of the RCU CPU stall
+	 * warning delay.
 	 */
-	if (rdp->rsp == rcu_state_p &&
+	rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu);
+	if (ULONG_CMP_GE(jiffies,
+			 rdp->rsp->gp_start + jiffies_till_sched_qs) ||
 	    ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
-		rdp->rsp->jiffies_resched += 5;
-		resched_cpu(rdp->cpu);
+		if (!(ACCESS_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) {
+			ACCESS_ONCE(rdp->cond_resched_completed) =
+				ACCESS_ONCE(rdp->mynode->completed);
+			smp_mb(); /* ->cond_resched_completed before *rcrmp. */
+			ACCESS_ONCE(*rcrmp) =
+				ACCESS_ONCE(*rcrmp) + rdp->rsp->flavor_mask;
+			resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
+			rdp->rsp->jiffies_resched += 5; /* Enable beating. */
+		} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
+			/* Time to beat on that CPU again! */
+			resched_cpu(rdp->cpu);  /* Force CPU into scheduler. */
+			rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+		}
 	}
 
 	return 0;
@@ -3491,6 +3572,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 			       "rcu_node_fqs_1",
 			       "rcu_node_fqs_2",
 			       "rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */
+	static u8 fl_mask = 0x1;
 	int cpustride = 1;
 	int i;
 	int j;
@@ -3509,6 +3591,8 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 	for (i = 1; i < rcu_num_lvls; i++)
 		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
 	rcu_init_levelspread(rsp);
+	rsp->flavor_mask = fl_mask;
+	fl_mask <<= 1;
 
 	/* Initialize the elements themselves, starting from the leaves. */
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bf2c1e669691..0f69a79c5b7d 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -307,6 +307,9 @@ struct rcu_data {
 	/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
 	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
 	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long cond_resched_completed;
+					/* Grace period that needs help */
+					/*  from cond_resched(). */
 
 	/* 5) __rcu_pending() statistics. */
 	unsigned long n_rcu_pending;	/* rcu_pending() calls since boot. */
@@ -392,6 +395,7 @@ struct rcu_state {
 	struct rcu_node *level[RCU_NUM_LVLS];	/* Hierarchy levels. */
 	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
 	u8 levelspread[RCU_NUM_LVLS];		/* kids/node in each level. */
+	u8 flavor_mask;				/* bit in flavor mask. */
 	struct rcu_data __percpu *rda;		/* pointer of percu rcu_data. */
 	void (*call)(struct rcu_head *head,	/* call_rcu() flavor. */
 		     void (*func)(struct rcu_head *head));
@@ -563,7 +567,7 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
 static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
 static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
 static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void rcu_kick_nohz_cpu(int cpu);
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
 static bool init_nocb_callback_list(struct rcu_data *rdp);
 static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
 static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index cbc2c45265e2..02ac0fb186b8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2404,7 +2404,7 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
  * if an adaptive-ticks CPU is failing to respond to the current grace
  * period and has not be idle from an RCU perspective, kick it.
  */
-static void rcu_kick_nohz_cpu(int cpu)
+static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
 {
 #ifdef CONFIG_NO_HZ_FULL
 	if (tick_nohz_full_cpu(cpu))
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 0fb691e63ce6..bc7883570530 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -350,21 +350,3 @@ static int __init check_cpu_stall_init(void)
 early_initcall(check_cpu_stall_init);
 
 #endif /* #ifdef CONFIG_RCU_STALL_COMMON */
-
-/*
- * Hooks for cond_resched() and friends to avoid RCU CPU stall warnings.
- */
-
-DEFINE_PER_CPU(int, rcu_cond_resched_count);
-
-/*
- * Report a set of RCU quiescent states, for use by cond_resched()
- * and friends.  Out of line due to being called infrequently.
- */
-void rcu_resched(void)
-{
-	preempt_disable();
-	__this_cpu_write(rcu_cond_resched_count, 0);
-	rcu_note_context_switch(smp_processor_id());
-	preempt_enable();
-}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b494fe..bc1638b33449 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4147,7 +4147,6 @@ static void __cond_resched(void)
 
 int __sched _cond_resched(void)
 {
-	rcu_cond_resched();
 	if (should_resched()) {
 		__cond_resched();
 		return 1;
@@ -4166,18 +4165,15 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
-	bool need_rcu_resched = rcu_should_resched();
 	int resched = should_resched();
 	int ret = 0;
 
 	lockdep_assert_held(lock);
 
-	if (spin_needbreak(lock) || resched || need_rcu_resched) {
+	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
 			__cond_resched();
-		else if (unlikely(need_rcu_resched))
-			rcu_resched();
 		else
 			cpu_relax();
 		ret = 1;
@@ -4191,7 +4187,6 @@ int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
-	rcu_cond_resched();  /* BH disabled OK, just recording QSes. */
 	if (should_resched()) {
 		local_bh_enable();
 		__cond_resched();
-- 
cgit v1.2.3-59-g8ed1b


From b6220ad66bcd4a50737eb3c08e9466aa44f3bc98 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Tue, 24 Jun 2014 18:05:29 -0700
Subject: sched: Fix compiler warnings

Commit 143e1e28cb (sched: Rework sched_domain topology definition)
introduced a number of functions with a return value of 'const int'.
gcc doesn't know what to do with that and, if the kernel is compiled
with W=1, complains with the following warnings whenever sched.h
is included.

  include/linux/sched.h:875:25: warning: type qualifiers ignored on function return type
  include/linux/sched.h:882:25: warning: type qualifiers ignored on function return type
  include/linux/sched.h:889:25: warning: type qualifiers ignored on function return type
  include/linux/sched.h:1002:21: warning: type qualifiers ignored on function return type

Commits fb2aa855 (sched, ARM: Create a dedicated scheduler topology table)
and 607b45e9a (sched, powerpc: Create a dedicated topology table) introduce
the same warning in the arm and powerpc code.

Drop 'const' from the function declarations to fix the problem.

The fix for all three patches has to be applied together to avoid
compilation failures for the affected architectures.

Acked-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1403658329-13196-1-git-send-email-linux@roeck-us.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/topology.c | 2 +-
 arch/powerpc/kernel/smp.c  | 2 +-
 include/linux/sched.h      | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 9d853189028b..e35d880f9773 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -275,7 +275,7 @@ void store_cpu_topology(unsigned int cpuid)
 		cpu_topology[cpuid].socket_id, mpidr);
 }
 
-static inline const int cpu_corepower_flags(void)
+static inline int cpu_corepower_flags(void)
 {
 	return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
 }
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 51a3ff78838a..1007fb802e6b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -747,7 +747,7 @@ int setup_profiling_timer(unsigned int multiplier)
 
 #ifdef CONFIG_SCHED_SMT
 /* cpumask of CPUs with asymetric SMT dependancy */
-static const int powerpc_smt_flags(void)
+static int powerpc_smt_flags(void)
 {
 	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 306f4f0c987a..0376b054a0d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -872,21 +872,21 @@ enum cpu_idle_type {
 #define SD_NUMA			0x4000	/* cross-node balancing */
 
 #ifdef CONFIG_SCHED_SMT
-static inline const int cpu_smt_flags(void)
+static inline int cpu_smt_flags(void)
 {
 	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
 #ifdef CONFIG_SCHED_MC
-static inline const int cpu_core_flags(void)
+static inline int cpu_core_flags(void)
 {
 	return SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
 #ifdef CONFIG_NUMA
-static inline const int cpu_numa_flags(void)
+static inline int cpu_numa_flags(void)
 {
 	return SD_NUMA;
 }
@@ -999,7 +999,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 bool cpus_share_cache(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-typedef const int (*sched_domain_flags_f)(void);
+typedef int (*sched_domain_flags_f)(void);
 
 #define SDTL_OVERLAP	0x01
 
-- 
cgit v1.2.3-59-g8ed1b


From d9daa24720891a88bedb93928f57767da96e5c80 Mon Sep 17 00:00:00 2001
From: Daniel Mack <zonque@gmail.com>
Date: Sat, 28 Jun 2014 01:23:35 +0200
Subject: net: fix circular dependency in of_mdio code

Commit 86f6cf4127 (net: of_mdio: add of_mdiobus_link_phydev()) introduced a
circular dependency between libphy and of_mdio.

depmod: ERROR: <modroot>/kernel/drivers/net/phy/libphy.ko in
dependency cycle!
depmod: ERROR: <modroot>/kernel/drivers/of/of_mdio.ko in dependency cycle!

The problem is that of_mdio.c references &mdio_bus_type and libphy now
references of_mdiobus_link_phydev.

Fix this by not exporting of_mdiobus_link_phydev() from of_mdio.ko.
Make it a static function in mdio_bus.c instead.

Signed-off-by: Daniel Mack <zonque@gmail.com>
Reported-by: Jeff Mahoney <jeffm@suse.com>
Fixes: 86f6cf4127 (net: of_mdio: add of_mdiobus_link_phydev())
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/mdio_bus.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/of/of_mdio.c       | 34 ----------------------------------
 include/linux/of_mdio.h    |  8 --------
 3 files changed, 44 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 2e58aa54484c..4eaadcfcb0fe 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -187,6 +187,50 @@ struct mii_bus *of_mdio_find_bus(struct device_node *mdio_bus_np)
 	return d ? to_mii_bus(d) : NULL;
 }
 EXPORT_SYMBOL(of_mdio_find_bus);
+
+/* Walk the list of subnodes of a mdio bus and look for a node that matches the
+ * phy's address with its 'reg' property. If found, set the of_node pointer for
+ * the phy. This allows auto-probed pyh devices to be supplied with information
+ * passed in via DT.
+ */
+static void of_mdiobus_link_phydev(struct mii_bus *mdio,
+				   struct phy_device *phydev)
+{
+	struct device *dev = &phydev->dev;
+	struct device_node *child;
+
+	if (dev->of_node || !mdio->dev.of_node)
+		return;
+
+	for_each_available_child_of_node(mdio->dev.of_node, child) {
+		int addr;
+		int ret;
+
+		ret = of_property_read_u32(child, "reg", &addr);
+		if (ret < 0) {
+			dev_err(dev, "%s has invalid PHY address\n",
+				child->full_name);
+			continue;
+		}
+
+		/* A PHY must have a reg property in the range [0-31] */
+		if (addr >= PHY_MAX_ADDR) {
+			dev_err(dev, "%s PHY address %i is too large\n",
+				child->full_name, addr);
+			continue;
+		}
+
+		if (addr == phydev->addr) {
+			dev->of_node = child;
+			return;
+		}
+	}
+}
+#else /* !IS_ENABLED(CONFIG_OF_MDIO) */
+static inline void of_mdiobus_link_phydev(struct mii_bus *mdio,
+					  struct phy_device *phydev)
+{
+}
 #endif
 
 /**
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index a3bf2122a8d5..401b2453da45 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -182,40 +182,6 @@ int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
 }
 EXPORT_SYMBOL(of_mdiobus_register);
 
-/**
- * of_mdiobus_link_phydev - Find a device node for a phy
- * @mdio: pointer to mii_bus structure
- * @phydev: phydev for which the of_node pointer should be set
- *
- * Walk the list of subnodes of a mdio bus and look for a node that matches the
- * phy's address with its 'reg' property. If found, set the of_node pointer for
- * the phy. This allows auto-probed pyh devices to be supplied with information
- * passed in via DT.
- */
-void of_mdiobus_link_phydev(struct mii_bus *mdio,
-			    struct phy_device *phydev)
-{
-	struct device *dev = &phydev->dev;
-	struct device_node *child;
-
-	if (dev->of_node || !mdio->dev.of_node)
-		return;
-
-	for_each_available_child_of_node(mdio->dev.of_node, child) {
-		int addr;
-
-		addr = of_mdio_parse_addr(&mdio->dev, child);
-		if (addr < 0)
-			continue;
-
-		if (addr == phydev->addr) {
-			dev->of_node = child;
-			return;
-		}
-	}
-}
-EXPORT_SYMBOL(of_mdiobus_link_phydev);
-
 /* Helper function for of_phy_find_device */
 static int of_phy_match(struct device *dev, void *phy_np)
 {
diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h
index a70c9493d55a..d449018d0726 100644
--- a/include/linux/of_mdio.h
+++ b/include/linux/of_mdio.h
@@ -25,9 +25,6 @@ struct phy_device *of_phy_attach(struct net_device *dev,
 
 extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np);
 
-extern void of_mdiobus_link_phydev(struct mii_bus *mdio,
-				   struct phy_device *phydev);
-
 #else /* CONFIG_OF */
 static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
 {
@@ -63,11 +60,6 @@ static inline struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np)
 {
 	return NULL;
 }
-
-static inline void of_mdiobus_link_phydev(struct mii_bus *mdio,
-					  struct phy_device *phydev)
-{
-}
 #endif /* CONFIG_OF */
 
 #if defined(CONFIG_OF) && defined(CONFIG_FIXED_PHY)
-- 
cgit v1.2.3-59-g8ed1b


From 35f6f45368632f21bd27559c44dbb1cab51d8947 Mon Sep 17 00:00:00 2001
From: Amir Vadai <amirv@mellanox.com>
Date: Sun, 29 Jun 2014 11:54:55 +0300
Subject: net/mlx4_en: Don't use irq_affinity_notifier to track changes in IRQ
 affinity map

IRQ affinity notifier can only have a single notifier - cpu_rmap
notifier. Can't use it to track changes in IRQ affinity map.
Detect IRQ affinity changes by comparing CPU to current IRQ affinity map
during NAPI poll thread.

CC: Thomas Gleixner <tglx@linutronix.de>
CC: Ben Hutchings <ben@decadent.org.uk>
Fixes: 2eacc23 ("net/mlx4_core: Enforce irq affinity changes immediatly")
Signed-off-by: Amir Vadai <amirv@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/cq.c      |  2 -
 drivers/net/ethernet/mellanox/mlx4/en_cq.c   |  4 ++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   | 16 +++++--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c   |  6 ---
 drivers/net/ethernet/mellanox/mlx4/eq.c      | 69 ++++------------------------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |  1 +
 include/linux/mlx4/device.h                  |  4 +-
 7 files changed, 28 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c
index 80f725228f5b..56022d647837 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cq.c
@@ -294,8 +294,6 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent,
 	init_completion(&cq->free);
 
 	cq->irq = priv->eq_table.eq[cq->vector].irq;
-	cq->irq_affinity_change = false;
-
 	return 0;
 
 err_radix:
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
index 4b2130760eed..1213cc71348c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c
@@ -128,6 +128,10 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq,
 					mlx4_warn(mdev, "Failed assigning an EQ to %s, falling back to legacy EQ's\n",
 						  name);
 				}
+
+				cq->irq_desc =
+					irq_to_desc(mlx4_eq_get_irq(mdev->dev,
+								    cq->vector));
 			}
 		} else {
 			cq->vector = (cq->ring + 1 + priv->port) %
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index d2d415732d99..96724170308a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -40,6 +40,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_vlan.h>
 #include <linux/vmalloc.h>
+#include <linux/irq.h>
 
 #include "mlx4_en.h"
 
@@ -896,16 +897,25 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
 
 	/* If we used up all the quota - we're probably not done yet... */
 	if (done == budget) {
+		int cpu_curr;
+		const struct cpumask *aff;
+
 		INC_PERF_COUNTER(priv->pstats.napi_quota);
-		if (unlikely(cq->mcq.irq_affinity_change)) {
-			cq->mcq.irq_affinity_change = false;
+
+		cpu_curr = smp_processor_id();
+		aff = irq_desc_get_irq_data(cq->irq_desc)->affinity;
+
+		if (unlikely(!cpumask_test_cpu(cpu_curr, aff))) {
+			/* Current cpu is not according to smp_irq_affinity -
+			 * probably affinity changed. need to stop this NAPI
+			 * poll, and restart it on the right CPU
+			 */
 			napi_complete(napi);
 			mlx4_en_arm_cq(priv, cq);
 			return 0;
 		}
 	} else {
 		/* Done for now */
-		cq->mcq.irq_affinity_change = false;
 		napi_complete(napi);
 		mlx4_en_arm_cq(priv, cq);
 	}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 8be7483f8236..ac3dead3792c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -474,15 +474,9 @@ int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
 	/* If we used up all the quota - we're probably not done yet... */
 	if (done < budget) {
 		/* Done for now */
-		cq->mcq.irq_affinity_change = false;
 		napi_complete(napi);
 		mlx4_en_arm_cq(priv, cq);
 		return done;
-	} else if (unlikely(cq->mcq.irq_affinity_change)) {
-		cq->mcq.irq_affinity_change = false;
-		napi_complete(napi);
-		mlx4_en_arm_cq(priv, cq);
-		return 0;
 	}
 	return budget;
 }
diff --git a/drivers/net/ethernet/mellanox/mlx4/eq.c b/drivers/net/ethernet/mellanox/mlx4/eq.c
index d954ec1eac17..2a004b347e1d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx4/eq.c
@@ -53,11 +53,6 @@ enum {
 	MLX4_EQ_ENTRY_SIZE	= 0x20
 };
 
-struct mlx4_irq_notify {
-	void *arg;
-	struct irq_affinity_notify notify;
-};
-
 #define MLX4_EQ_STATUS_OK	   ( 0 << 28)
 #define MLX4_EQ_STATUS_WRITE_FAIL  (10 << 28)
 #define MLX4_EQ_OWNER_SW	   ( 0 << 24)
@@ -1088,57 +1083,6 @@ static void mlx4_unmap_clr_int(struct mlx4_dev *dev)
 	iounmap(priv->clr_base);
 }
 
-static void mlx4_irq_notifier_notify(struct irq_affinity_notify *notify,
-				     const cpumask_t *mask)
-{
-	struct mlx4_irq_notify *n = container_of(notify,
-						 struct mlx4_irq_notify,
-						 notify);
-	struct mlx4_priv *priv = (struct mlx4_priv *)n->arg;
-	struct radix_tree_iter iter;
-	void **slot;
-
-	radix_tree_for_each_slot(slot, &priv->cq_table.tree, &iter, 0) {
-		struct mlx4_cq *cq = (struct mlx4_cq *)(*slot);
-
-		if (cq->irq == notify->irq)
-			cq->irq_affinity_change = true;
-	}
-}
-
-static void mlx4_release_irq_notifier(struct kref *ref)
-{
-	struct mlx4_irq_notify *n = container_of(ref, struct mlx4_irq_notify,
-						 notify.kref);
-	kfree(n);
-}
-
-static void mlx4_assign_irq_notifier(struct mlx4_priv *priv,
-				     struct mlx4_dev *dev, int irq)
-{
-	struct mlx4_irq_notify *irq_notifier = NULL;
-	int err = 0;
-
-	irq_notifier = kzalloc(sizeof(*irq_notifier), GFP_KERNEL);
-	if (!irq_notifier) {
-		mlx4_warn(dev, "Failed to allocate irq notifier. irq %d\n",
-			  irq);
-		return;
-	}
-
-	irq_notifier->notify.irq = irq;
-	irq_notifier->notify.notify = mlx4_irq_notifier_notify;
-	irq_notifier->notify.release = mlx4_release_irq_notifier;
-	irq_notifier->arg = priv;
-	err = irq_set_affinity_notifier(irq, &irq_notifier->notify);
-	if (err) {
-		kfree(irq_notifier);
-		irq_notifier = NULL;
-		mlx4_warn(dev, "Failed to set irq notifier. irq %d\n", irq);
-	}
-}
-
-
 int mlx4_alloc_eq_table(struct mlx4_dev *dev)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -1409,8 +1353,6 @@ int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
 				continue;
 				/*we dont want to break here*/
 			}
-			mlx4_assign_irq_notifier(priv, dev,
-						 priv->eq_table.eq[vec].irq);
 
 			eq_set_ci(&priv->eq_table.eq[vec], 1);
 		}
@@ -1427,6 +1369,14 @@ int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
 }
 EXPORT_SYMBOL(mlx4_assign_eq);
 
+int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec)
+{
+	struct mlx4_priv *priv = mlx4_priv(dev);
+
+	return priv->eq_table.eq[vec].irq;
+}
+EXPORT_SYMBOL(mlx4_eq_get_irq);
+
 void mlx4_release_eq(struct mlx4_dev *dev, int vec)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -1438,9 +1388,6 @@ void mlx4_release_eq(struct mlx4_dev *dev, int vec)
 		  Belonging to a legacy EQ*/
 		mutex_lock(&priv->msix_ctl.pool_lock);
 		if (priv->msix_ctl.pool_bm & 1ULL << i) {
-			irq_set_affinity_notifier(
-				priv->eq_table.eq[vec].irq,
-				NULL);
 			free_irq(priv->eq_table.eq[vec].irq,
 				 &priv->eq_table.eq[vec]);
 			priv->msix_ctl.pool_bm &= ~(1ULL << i);
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 0e15295bedd6..624e1939e9ee 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -343,6 +343,7 @@ struct mlx4_en_cq {
 #define CQ_USER_PEND (MLX4_EN_CQ_STATE_POLL | MLX4_EN_CQ_STATE_POLL_YIELD)
 	spinlock_t poll_lock; /* protects from LLS/napi conflicts */
 #endif  /* CONFIG_NET_RX_BUSY_POLL */
+	struct irq_desc *irq_desc;
 };
 
 struct mlx4_en_port_profile {
diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h
index b12f4bbd064c..35b51e7af886 100644
--- a/include/linux/mlx4/device.h
+++ b/include/linux/mlx4/device.h
@@ -578,8 +578,6 @@ struct mlx4_cq {
 	u32			cons_index;
 
 	u16                     irq;
-	bool                    irq_affinity_change;
-
 	__be32		       *set_ci_db;
 	__be32		       *arm_db;
 	int			arm_sn;
@@ -1167,6 +1165,8 @@ int mlx4_assign_eq(struct mlx4_dev *dev, char *name, struct cpu_rmap *rmap,
 		   int *vector);
 void mlx4_release_eq(struct mlx4_dev *dev, int vec);
 
+int mlx4_eq_get_irq(struct mlx4_dev *dev, int vec);
+
 int mlx4_get_phys_port_id(struct mlx4_dev *dev);
 int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port);
 int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port);
-- 
cgit v1.2.3-59-g8ed1b


From 046a619d8e9746fa4c0e29e8c6b78e16efc008a8 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Mon, 14 Jul 2014 10:27:48 -0700
Subject: locking/spinlocks/mcs: Rename optimistic_spin_queue() to
 optimistic_spin_node()

Currently, the per-cpu nodes structure for the cancellable MCS spinlock is
named "optimistic_spin_queue". However, in a follow up patch in the series
we will be introducing a new structure that serves as the new "handle" for
the lock. It would make more sense if that structure is named
"optimistic_spin_queue". Additionally, since the current use of the
"optimistic_spin_queue" structure are  "nodes", it might be better if we
rename them to "node" anyway.

This preparatory patch renames all current "optimistic_spin_queue"
to "optimistic_spin_node".

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chris Mason <clm@fb.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1405358872-3732-2-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h         |  4 ++--
 include/linux/rwsem.h         |  4 ++--
 kernel/locking/mcs_spinlock.c | 24 ++++++++++++------------
 kernel/locking/mcs_spinlock.h |  8 ++++----
 4 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 11692dea18aa..885f3f56a77f 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -46,7 +46,7 @@
  * - detects multi-task circular deadlocks and prints out all affected
  *   locks and tasks (and only those tasks)
  */
-struct optimistic_spin_queue;
+struct optimistic_spin_node;
 struct mutex {
 	/* 1: unlocked, 0: locked, negative: locked, possible waiters */
 	atomic_t		count;
@@ -56,7 +56,7 @@ struct mutex {
 	struct task_struct	*owner;
 #endif
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	struct optimistic_spin_queue	*osq;	/* Spinner MCS lock */
+	struct optimistic_spin_node	*osq;	/* Spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_MUTEXES
 	const char 		*name;
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8d79708146aa..ba3f108ddea1 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,7 +16,7 @@
 
 #include <linux/atomic.h>
 
-struct optimistic_spin_queue;
+struct optimistic_spin_node;
 struct rw_semaphore;
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +33,7 @@ struct rw_semaphore {
 	 * if the owner is running on the cpu.
 	 */
 	struct task_struct *owner;
-	struct optimistic_spin_queue *osq; /* spinner MCS lock */
+	struct optimistic_spin_node *osq; /* spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e00669..e9866f70e828 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,18 +14,18 @@
  * called from interrupt context and we have preemption disabled while
  * spinning.
  */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
 
 /*
  * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
  * Can return NULL in case we were the last queued and we updated @lock instead.
  */
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
-	      struct optimistic_spin_queue *node,
-	      struct optimistic_spin_queue *prev)
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_node **lock,
+	      struct optimistic_spin_node *node,
+	      struct optimistic_spin_node *prev)
 {
-	struct optimistic_spin_queue *next = NULL;
+	struct optimistic_spin_node *next = NULL;
 
 	for (;;) {
 		if (*lock == node && cmpxchg(lock, node, prev) == node) {
@@ -59,10 +59,10 @@ osq_wait_next(struct optimistic_spin_queue **lock,
 	return next;
 }
 
-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_node **lock)
 {
-	struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
-	struct optimistic_spin_queue *prev, *next;
+	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+	struct optimistic_spin_node *prev, *next;
 
 	node->locked = 0;
 	node->next = NULL;
@@ -149,10 +149,10 @@ unqueue:
 	return false;
 }
 
-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_node **lock)
 {
-	struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
-	struct optimistic_spin_queue *next;
+	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+	struct optimistic_spin_node *next;
 
 	/*
 	 * Fast path for the uncontended case.
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4aca6b..c99dc0052f49 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,12 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
  * mutex_lock()/rwsem_down_{read,write}() etc.
  */
 
-struct optimistic_spin_queue {
-	struct optimistic_spin_queue *next, *prev;
+struct optimistic_spin_node {
+	struct optimistic_spin_node *next, *prev;
 	int locked; /* 1 if lock acquired */
 };
 
-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_node **lock);
+extern void osq_unlock(struct optimistic_spin_node **lock);
 
 #endif /* __LINUX_MCS_SPINLOCK_H */
-- 
cgit v1.2.3-59-g8ed1b


From 90631822c5d307b5410500806e8ac3e63928aa3e Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Mon, 14 Jul 2014 10:27:49 -0700
Subject: locking/spinlocks/mcs: Convert osq lock to atomic_t to reduce
 overhead

The cancellable MCS spinlock is currently used to queue threads that are
doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
the lock would access and queue the local node corresponding to the CPU that
it's running on. Currently, the cancellable MCS lock is implemented by using
pointers to these nodes.

In this patch, instead of operating on pointers to the per-cpu nodes, we
store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
A similar concept is used with the qspinlock.

By operating on the CPU # of the nodes using atomic_t instead of pointers
to those nodes, this can reduce the overhead of the cancellable MCS spinlock
by 32 bits (on 64 bit systems).

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chris Mason <clm@fb.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1405358872-3732-3-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h         |  4 ++--
 include/linux/osq_lock.h      | 19 ++++++++++++++++++
 include/linux/rwsem.h         |  7 +++----
 kernel/locking/mcs_spinlock.c | 46 ++++++++++++++++++++++++++++++++++++-------
 kernel/locking/mcs_spinlock.h |  5 +++--
 kernel/locking/mutex.c        |  2 +-
 kernel/locking/rwsem-xadd.c   |  2 +-
 7 files changed, 68 insertions(+), 17 deletions(-)
 create mode 100644 include/linux/osq_lock.h

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 885f3f56a77f..42aa9b9ecd5f 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -17,6 +17,7 @@
 #include <linux/lockdep.h>
 #include <linux/atomic.h>
 #include <asm/processor.h>
+#include <linux/osq_lock.h>
 
 /*
  * Simple, straightforward mutexes with strict semantics:
@@ -46,7 +47,6 @@
  * - detects multi-task circular deadlocks and prints out all affected
  *   locks and tasks (and only those tasks)
  */
-struct optimistic_spin_node;
 struct mutex {
 	/* 1: unlocked, 0: locked, negative: locked, possible waiters */
 	atomic_t		count;
@@ -56,7 +56,7 @@ struct mutex {
 	struct task_struct	*owner;
 #endif
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	struct optimistic_spin_node	*osq;	/* Spinner MCS lock */
+	struct optimistic_spin_queue osq; /* Spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_MUTEXES
 	const char 		*name;
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
new file mode 100644
index 000000000000..b001682bf7cb
--- /dev/null
+++ b/include/linux/osq_lock.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_OSQ_LOCK_H
+#define __LINUX_OSQ_LOCK_H
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ */
+
+#define OSQ_UNLOCKED_VAL (0)
+
+struct optimistic_spin_queue {
+	/*
+	 * Stores an encoded value of the CPU # of the tail node in the queue.
+	 * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
+	 */
+	atomic_t tail;
+};
+
+#endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index ba3f108ddea1..9fdcdd03507d 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -13,10 +13,9 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-
 #include <linux/atomic.h>
+#include <linux/osq_lock.h>
 
-struct optimistic_spin_node;
 struct rw_semaphore;
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +32,7 @@ struct rw_semaphore {
 	 * if the owner is running on the cpu.
 	 */
 	struct task_struct *owner;
-	struct optimistic_spin_node *osq; /* spinner MCS lock */
+	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
@@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 	  __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),	\
 	  LIST_HEAD_INIT((name).wait_list),		\
 	  NULL, /* owner */				\
-	  NULL /* mcs lock */                           \
+	  { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */   \
 	  __RWSEM_DEP_MAP_INIT(name) }
 #else
 #define __RWSEM_INITIALIZER(name)			\
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index e9866f70e828..32fc16c0a545 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -16,19 +16,45 @@
  */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
 
+/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+	return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+	int cpu_nr = encoded_cpu_val - 1;
+
+	return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
 /*
  * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
  * Can return NULL in case we were the last queued and we updated @lock instead.
  */
 static inline struct optimistic_spin_node *
-osq_wait_next(struct optimistic_spin_node **lock,
+osq_wait_next(struct optimistic_spin_queue *lock,
 	      struct optimistic_spin_node *node,
 	      struct optimistic_spin_node *prev)
 {
 	struct optimistic_spin_node *next = NULL;
+	int curr = encode_cpu(smp_processor_id());
+	int old;
+
+	/*
+	 * If there is a prev node in queue, then the 'old' value will be
+	 * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+	 * we're currently last in queue, then the queue will then become empty.
+	 */
+	old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
 
 	for (;;) {
-		if (*lock == node && cmpxchg(lock, node, prev) == node) {
+		if (atomic_read(&lock->tail) == curr &&
+		    atomic_cmpxchg(&lock->tail, curr, old) == curr) {
 			/*
 			 * We were the last queued, we moved @lock back. @prev
 			 * will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_node **lock,
 	return next;
 }
 
-bool osq_lock(struct optimistic_spin_node **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
 {
 	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
 	struct optimistic_spin_node *prev, *next;
+	int curr = encode_cpu(smp_processor_id());
+	int old;
 
 	node->locked = 0;
 	node->next = NULL;
+	node->cpu = curr;
 
-	node->prev = prev = xchg(lock, node);
-	if (likely(prev == NULL))
+	old = atomic_xchg(&lock->tail, curr);
+	if (old == OSQ_UNLOCKED_VAL)
 		return true;
 
+	prev = decode_cpu(old);
+	node->prev = prev;
 	ACCESS_ONCE(prev->next) = node;
 
 	/*
@@ -149,15 +180,16 @@ unqueue:
 	return false;
 }
 
-void osq_unlock(struct optimistic_spin_node **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
 {
 	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
 	struct optimistic_spin_node *next;
+	int curr = encode_cpu(smp_processor_id());
 
 	/*
 	 * Fast path for the uncontended case.
 	 */
-	if (likely(cmpxchg(lock, node, NULL) == node))
+	if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
 		return;
 
 	/*
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index c99dc0052f49..74356dc0ce29 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -121,9 +121,10 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 struct optimistic_spin_node {
 	struct optimistic_spin_node *next, *prev;
 	int locked; /* 1 if lock acquired */
+	int cpu; /* encoded CPU # value */
 };
 
-extern bool osq_lock(struct optimistic_spin_node **lock);
-extern void osq_unlock(struct optimistic_spin_node **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);
 
 #endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33c6760..d9b313906caa 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 	INIT_LIST_HEAD(&lock->wait_list);
 	mutex_clear_owner(lock);
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	lock->osq = NULL;
+	atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL);
 #endif
 
 	debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index c40c7d28661d..b77a6230bbf6 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	INIT_LIST_HEAD(&sem->wait_list);
 #ifdef CONFIG_SMP
 	sem->owner = NULL;
-	sem->osq = NULL;
+	atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL);
 #endif
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 4d9d951e6b5df85ccfca2c5bd8b4f5c71d256b65 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Mon, 14 Jul 2014 10:27:50 -0700
Subject: locking/spinlocks/mcs: Introduce and use init macro and function for
 osq locks

Currently, we initialize the osq lock by directly setting the lock's values. It
would be preferable if we use an init macro to do the initialization like we do
with other locks.

This patch introduces and uses a macro and function for initializing the osq lock.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1405358872-3732-4-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/osq_lock.h    | 8 ++++++++
 include/linux/rwsem.h       | 2 +-
 kernel/locking/mutex.c      | 2 +-
 kernel/locking/rwsem-xadd.c | 2 +-
 4 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index b001682bf7cb..90230d5811c5 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -16,4 +16,12 @@ struct optimistic_spin_queue {
 	atomic_t tail;
 };
 
+/* Init macro and function. */
+#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
+
+static inline void osq_lock_init(struct optimistic_spin_queue *lock)
+{
+	atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
+}
+
 #endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 9fdcdd03507d..25cd9aa2f3d7 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -69,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 	  __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),	\
 	  LIST_HEAD_INIT((name).wait_list),		\
 	  NULL, /* owner */				\
-	  { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */   \
+	  OSQ_LOCK_UNLOCKED /* osq */			\
 	  __RWSEM_DEP_MAP_INIT(name) }
 #else
 #define __RWSEM_INITIALIZER(name)			\
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d9b313906caa..acca2c1a3c5e 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
 	INIT_LIST_HEAD(&lock->wait_list);
 	mutex_clear_owner(lock);
 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-	atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL);
+	osq_lock_init(&lock->osq);
 #endif
 
 	debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b77a6230bbf6..7190592c2645 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	INIT_LIST_HEAD(&sem->wait_list);
 #ifdef CONFIG_SMP
 	sem->owner = NULL;
-	atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL);
+	osq_lock_init(&sem->osq);
 #endif
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 13b9a962a2594ee880c5d50d7f70964da1d4fe5a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Jul 2014 14:54:55 +0200
Subject: locking/rwsem: Rename 'activity' to 'count'

There are two definitions of struct rw_semaphore, one in linux/rwsem.h
and one in linux/rwsem-spinlock.h.

For some reason they have different names for the initial field. This
makes it impossible to use C99 named initialization for
__RWSEM_INITIALIZER() -- or we have to duplicate that entire thing
along with the structure definitions.

The simpler patch is renaming the rwsem-spinlock variant to match the
regular rwsem.

This allows us to switch to C99 named initialization.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-bmrZolsbGmautmzrerog27io@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem-spinlock.h  |  8 ++++----
 kernel/locking/rwsem-spinlock.c | 28 ++++++++++++++--------------
 2 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem-spinlock.h b/include/linux/rwsem-spinlock.h
index d5b13bc07a0b..561e8615528d 100644
--- a/include/linux/rwsem-spinlock.h
+++ b/include/linux/rwsem-spinlock.h
@@ -15,13 +15,13 @@
 #ifdef __KERNEL__
 /*
  * the rw-semaphore definition
- * - if activity is 0 then there are no active readers or writers
- * - if activity is +ve then that is the number of active readers
- * - if activity is -1 then there is one active writer
+ * - if count is 0 then there are no active readers or writers
+ * - if count is +ve then that is the number of active readers
+ * - if count is -1 then there is one active writer
  * - if wait_list is not empty, then there are processes waiting for the semaphore
  */
 struct rw_semaphore {
-	__s32			activity;
+	__s32			count;
 	raw_spinlock_t		wait_lock;
 	struct list_head	wait_list;
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 9be8a9144978..2c93571162cb 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -26,7 +26,7 @@ int rwsem_is_locked(struct rw_semaphore *sem)
 	unsigned long flags;
 
 	if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
-		ret = (sem->activity != 0);
+		ret = (sem->count != 0);
 		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 	}
 	return ret;
@@ -46,7 +46,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	debug_check_no_locks_freed((void *)sem, sizeof(*sem));
 	lockdep_init_map(&sem->dep_map, name, key, 0);
 #endif
-	sem->activity = 0;
+	sem->count = 0;
 	raw_spin_lock_init(&sem->wait_lock);
 	INIT_LIST_HEAD(&sem->wait_list);
 }
@@ -95,7 +95,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 		waiter = list_entry(next, struct rwsem_waiter, list);
 	} while (waiter->type != RWSEM_WAITING_FOR_WRITE);
 
-	sem->activity += woken;
+	sem->count += woken;
 
  out:
 	return sem;
@@ -126,9 +126,9 @@ void __sched __down_read(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+	if (sem->count >= 0 && list_empty(&sem->wait_list)) {
 		/* granted */
-		sem->activity++;
+		sem->count++;
 		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
 		goto out;
 	}
@@ -170,9 +170,9 @@ int __down_read_trylock(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity >= 0 && list_empty(&sem->wait_list)) {
+	if (sem->count >= 0 && list_empty(&sem->wait_list)) {
 		/* granted */
-		sem->activity++;
+		sem->count++;
 		ret = 1;
 	}
 
@@ -206,7 +206,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 		 * itself into sleep and waiting for system woke it or someone
 		 * else in the head of the wait list up.
 		 */
-		if (sem->activity == 0)
+		if (sem->count == 0)
 			break;
 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 		raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -214,7 +214,7 @@ void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
 		raw_spin_lock_irqsave(&sem->wait_lock, flags);
 	}
 	/* got the lock */
-	sem->activity = -1;
+	sem->count = -1;
 	list_del(&waiter.list);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -235,9 +235,9 @@ int __down_write_trylock(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (sem->activity == 0) {
+	if (sem->count == 0) {
 		/* got the lock */
-		sem->activity = -1;
+		sem->count = -1;
 		ret = 1;
 	}
 
@@ -255,7 +255,7 @@ void __up_read(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	if (--sem->activity == 0 && !list_empty(&sem->wait_list))
+	if (--sem->count == 0 && !list_empty(&sem->wait_list))
 		sem = __rwsem_wake_one_writer(sem);
 
 	raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
@@ -270,7 +270,7 @@ void __up_write(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	sem->activity = 0;
+	sem->count = 0;
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem, 1);
 
@@ -287,7 +287,7 @@ void __downgrade_write(struct rw_semaphore *sem)
 
 	raw_spin_lock_irqsave(&sem->wait_lock, flags);
 
-	sem->activity = 1;
+	sem->count = 1;
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem, 0);
 
-- 
cgit v1.2.3-59-g8ed1b


From ce069fc920e5734558b3d9cbef1ab06cf01ee793 Mon Sep 17 00:00:00 2001
From: Jason Low <jason.low2@hp.com>
Date: Mon, 14 Jul 2014 10:27:52 -0700
Subject: locking/rwsem: Reduce the size of struct rw_semaphore

Recent optimistic spinning additions to rwsem provide significant performance
benefits on many workloads on large machines. The cost of it was increasing
the size of the rwsem structure by up to 128 bits.

However, now that the previous patches in this series bring the overhead of
struct optimistic_spin_queue to 32 bits, this patch reorders some fields in
struct rw_semaphore such that we can reduce the overhead of the rwsem structure
by 64 bits (on 64 bit systems).

The extra overhead required for rwsem optimistic spinning would now be up
to 8 additional bytes instead of up to 16 bytes. Additionally, the size of
rwsem would now be more in line with mutexes.

Signed-off-by: Jason Low <jason.low2@hp.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Norton <scott.norton@hp.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Waiman Long <waiman.long@hp.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Aswin Chandramouleeswaran <aswin@hp.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Chris Mason <clm@fb.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Link: http://lkml.kernel.org/r/1405358872-3732-6-git-send-email-jason.low2@hp.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem.h | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 25cd9aa2f3d7..716807f0eb2d 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -24,15 +24,15 @@ struct rw_semaphore;
 /* All arch specific implementations share the same struct */
 struct rw_semaphore {
 	long count;
-	raw_spinlock_t wait_lock;
 	struct list_head wait_list;
+	raw_spinlock_t wait_lock;
 #ifdef CONFIG_SMP
+	struct optimistic_spin_queue osq; /* spinner MCS lock */
 	/*
 	 * Write owner. Used as a speculative check to see
 	 * if the owner is running on the cpu.
 	 */
 	struct task_struct *owner;
-	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map	dep_map;
@@ -64,21 +64,18 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 #endif
 
 #if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
-#define __RWSEM_INITIALIZER(name)			\
-	{ RWSEM_UNLOCKED_VALUE,				\
-	  __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),	\
-	  LIST_HEAD_INIT((name).wait_list),		\
-	  NULL, /* owner */				\
-	  OSQ_LOCK_UNLOCKED /* osq */			\
-	  __RWSEM_DEP_MAP_INIT(name) }
+#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL
 #else
-#define __RWSEM_INITIALIZER(name)			\
-	{ RWSEM_UNLOCKED_VALUE,				\
-	  __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),	\
-	  LIST_HEAD_INIT((name).wait_list)		\
-	  __RWSEM_DEP_MAP_INIT(name) }
+#define __RWSEM_OPT_INIT(lockname)
 #endif
 
+#define __RWSEM_INITIALIZER(name)				\
+	{ .count = RWSEM_UNLOCKED_VALUE,			\
+	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
+	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)	\
+	  __RWSEM_OPT_INIT(name)				\
+	  __RWSEM_DEP_MAP_INIT(name) }
+
 #define DECLARE_RWSEM(name) \
 	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
 
-- 
cgit v1.2.3-59-g8ed1b


From 5db6c6fefb1ca0e81e3bd6dd8998bf51c453d823 Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <davidlohr@hp.com>
Date: Fri, 11 Jul 2014 14:00:06 -0700
Subject: locking/rwsem: Add CONFIG_RWSEM_SPIN_ON_OWNER

Just like with mutexes (CONFIG_MUTEX_SPIN_ON_OWNER),
encapsulate the dependencies for rwsem optimistic spinning.
No logical changes here as it continues to depend on both
SMP and the XADD algorithm variant.

Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Acked-by: Jason Low <jason.low2@hp.com>
[ Also make it depend on ARCH_SUPPORTS_ATOMIC_RMW. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1405112406-13052-2-git-send-email-davidlohr@hp.com
Cc: aswin@hp.com
Cc: Chris Mason <clm@fb.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Waiman Long <Waiman.Long@hp.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem.h       | 6 ++++--
 kernel/Kconfig.locks        | 4 ++++
 kernel/locking/rwsem-xadd.c | 4 ++--
 kernel/locking/rwsem.c      | 2 +-
 4 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 716807f0eb2d..035d3c57fc8a 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -14,7 +14,9 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/atomic.h>
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 #include <linux/osq_lock.h>
+#endif
 
 struct rw_semaphore;
 
@@ -26,7 +28,7 @@ struct rw_semaphore {
 	long count;
 	struct list_head wait_list;
 	raw_spinlock_t wait_lock;
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	struct optimistic_spin_queue osq; /* spinner MCS lock */
 	/*
 	 * Write owner. Used as a speculative check to see
@@ -63,7 +65,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 # define __RWSEM_DEP_MAP_INIT(lockname)
 #endif
 
-#if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 #define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL
 #else
 #define __RWSEM_OPT_INIT(lockname)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 81907941d921..76768ee812b2 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -227,6 +227,10 @@ config MUTEX_SPIN_ON_OWNER
 	def_bool y
 	depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
 
+config RWSEM_SPIN_ON_OWNER
+       def_bool y
+       depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
+
 config ARCH_USE_QUEUE_RWLOCK
 	bool
 
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 7190592c2645..a2391ac135c8 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -82,7 +82,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
 	sem->count = RWSEM_UNLOCKED_VALUE;
 	raw_spin_lock_init(&sem->wait_lock);
 	INIT_LIST_HEAD(&sem->wait_list);
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	sem->owner = NULL;
 	osq_lock_init(&sem->osq);
 #endif
@@ -262,7 +262,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
 	return false;
 }
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 /*
  * Try to acquire write lock before the writer has been put on wait queue.
  */
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 42f806de49d4..e2d3bc7f03b4 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,7 +12,7 @@
 
 #include <linux/atomic.h>
 
-#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
 	sem->owner = current;
-- 
cgit v1.2.3-59-g8ed1b


From 2b1987a9f1e6250c962ca13820d3c69817879266 Mon Sep 17 00:00:00 2001
From: Brian W Hart <hartb@linux.vnet.ibm.com>
Date: Fri, 27 Jun 2014 16:09:39 -0500
Subject: cpufreq: make table sentinel macros unsigned to match use
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit 5eeaf1f18973 (cpufreq: Fix build error on some platforms that
use cpufreq_for_each_*) moved function cpufreq_next_valid() to a public
header.  Warnings are now generated when objects including that header
are built with -Wsign-compare (as an out-of-tree module might be):

.../include/linux/cpufreq.h: In function ‘cpufreq_next_valid’:
.../include/linux/cpufreq.h:519:27: warning: comparison between signed
and unsigned integer expressions [-Wsign-compare]
  while ((*pos)->frequency != CPUFREQ_TABLE_END)
                           ^
.../include/linux/cpufreq.h:520:25: warning: comparison between signed
and unsigned integer expressions [-Wsign-compare]
   if ((*pos)->frequency != CPUFREQ_ENTRY_INVALID)
                         ^

Constants CPUFREQ_ENTRY_INVALID and CPUFREQ_TABLE_END are signed, but
are used with unsigned member 'frequency' of cpufreq_frequency_table.
Update the macro definitions to be explicitly unsigned to match their
use.

This also corrects potentially wrong behavior of clk_rate_table_iter()
if unsigned long is wider than usigned int.

Fixes: 5eeaf1f18973 (cpufreq: Fix build error on some platforms that use cpufreq_for_each_*)
Signed-off-by: Brian W Hart <hartb@linux.vnet.ibm.com>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpufreq.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index ec4112d257bc..8f8ae95c6e27 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -482,8 +482,8 @@ extern struct cpufreq_governor cpufreq_gov_conservative;
  *********************************************************************/
 
 /* Special Values of .frequency field */
-#define CPUFREQ_ENTRY_INVALID	~0
-#define CPUFREQ_TABLE_END	~1
+#define CPUFREQ_ENTRY_INVALID	~0u
+#define CPUFREQ_TABLE_END	~1u
 /* Special Values of .flags field */
 #define CPUFREQ_BOOST_FREQ	(1 << 0)
 
-- 
cgit v1.2.3-59-g8ed1b


From 1a112d10f03e83fb3a2fdc4c9165865dec8a3ca6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 23 Jul 2014 09:05:27 -0400
Subject: libata: introduce ata_host->n_tags to avoid oops on SAS controllers

1871ee134b73 ("libata: support the ata host which implements a queue
depth less than 32") directly used ata_port->scsi_host->can_queue from
ata_qc_new() to determine the number of tags supported by the host;
unfortunately, SAS controllers doing SATA don't initialize ->scsi_host
leading to the following oops.

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000058
 IP: [<ffffffff814e0618>] ata_qc_new_init+0x188/0x1b0
 PGD 0
 Oops: 0002 [#1] SMP
 Modules linked in: isci libsas scsi_transport_sas mgag200 drm_kms_helper ttm
 CPU: 1 PID: 518 Comm: udevd Not tainted 3.16.0-rc6+ #62
 Hardware name: Intel Corporation S2600CO/S2600CO, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013
 task: ffff880c1a00b280 ti: ffff88061a000000 task.ti: ffff88061a000000
 RIP: 0010:[<ffffffff814e0618>]  [<ffffffff814e0618>] ata_qc_new_init+0x188/0x1b0
 RSP: 0018:ffff88061a003ae8  EFLAGS: 00010012
 RAX: 0000000000000001 RBX: ffff88000241ca80 RCX: 00000000000000fa
 RDX: 0000000000000020 RSI: 0000000000000020 RDI: ffff8806194aa298
 RBP: ffff88061a003ae8 R08: ffff8806194a8000 R09: 0000000000000000
 R10: 0000000000000000 R11: ffff88000241ca80 R12: ffff88061ad58200
 R13: ffff8806194aa298 R14: ffffffff814e67a0 R15: ffff8806194a8000
 FS:  00007f3ad7fe3840(0000) GS:ffff880627620000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000000000000058 CR3: 000000061a118000 CR4: 00000000001407e0
 Stack:
  ffff88061a003b20 ffffffff814e96e1 ffff88000241ca80 ffff88061ad58200
  ffff8800b6bf6000 ffff880c1c988000 ffff880619903850 ffff88061a003b68
  ffffffffa0056ce1 ffff88061a003b48 0000000013d6e6f8 ffff88000241ca80
 Call Trace:
  [<ffffffff814e96e1>] ata_sas_queuecmd+0xa1/0x430
  [<ffffffffa0056ce1>] sas_queuecommand+0x191/0x220 [libsas]
  [<ffffffff8149afee>] scsi_dispatch_cmd+0x10e/0x300
  [<ffffffff814a3bc5>] scsi_request_fn+0x2f5/0x550
  [<ffffffff81317613>] __blk_run_queue+0x33/0x40
  [<ffffffff8131781a>] queue_unplugged+0x2a/0x90
  [<ffffffff8131ceb4>] blk_flush_plug_list+0x1b4/0x210
  [<ffffffff8131d274>] blk_finish_plug+0x14/0x50
  [<ffffffff8117eaa8>] __do_page_cache_readahead+0x198/0x1f0
  [<ffffffff8117ee21>] force_page_cache_readahead+0x31/0x50
  [<ffffffff8117ee7e>] page_cache_sync_readahead+0x3e/0x50
  [<ffffffff81172ac6>] generic_file_read_iter+0x496/0x5a0
  [<ffffffff81219897>] blkdev_read_iter+0x37/0x40
  [<ffffffff811e307e>] new_sync_read+0x7e/0xb0
  [<ffffffff811e3734>] vfs_read+0x94/0x170
  [<ffffffff811e43c6>] SyS_read+0x46/0xb0
  [<ffffffff811e33d1>] ? SyS_lseek+0x91/0xb0
  [<ffffffff8171ee29>] system_call_fastpath+0x16/0x1b
 Code: 00 00 00 88 50 29 83 7f 08 01 19 d2 83 e2 f0 83 ea 50 88 50 34 c6 81 1d 02 00 00 40 c6 81 17 02 00 00 00 5d c3 66 0f 1f 44 00 00 <89> 14 25 58 00 00 00

Fix it by introducing ata_host->n_tags which is initialized to
ATA_MAX_QUEUE - 1 in ata_host_init() for SAS controllers and set to
scsi_host_template->can_queue in ata_host_register() for !SAS ones.
As SAS hosts are never registered, this will give them the same
ATA_MAX_QUEUE - 1 as before.  Note that we can't use
scsi_host->can_queue directly for SAS hosts anyway as they can go
higher than the libata maximum.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mike Qiu <qiudayu@linux.vnet.ibm.com>
Reported-by: Jesse Brandeburg <jesse.brandeburg@gmail.com>
Reported-by: Peter Hurley <peter@hurleysoftware.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Tested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Fixes: 1871ee134b73 ("libata: support the ata host which implements a queue depth less than 32")
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: stable@vger.kernel.org
---
 drivers/ata/libata-core.c | 16 ++++------------
 include/linux/libata.h    |  1 +
 2 files changed, 5 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index d19c37a7abc9..677c0c1b03bd 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4798,9 +4798,8 @@ void swap_buf_le16(u16 *buf, unsigned int buf_words)
 static struct ata_queued_cmd *ata_qc_new(struct ata_port *ap)
 {
 	struct ata_queued_cmd *qc = NULL;
-	unsigned int i, tag, max_queue;
-
-	max_queue = ap->scsi_host->can_queue;
+	unsigned int max_queue = ap->host->n_tags;
+	unsigned int i, tag;
 
 	/* no command while frozen */
 	if (unlikely(ap->pflags & ATA_PFLAG_FROZEN))
@@ -6094,6 +6093,7 @@ void ata_host_init(struct ata_host *host, struct device *dev,
 {
 	spin_lock_init(&host->lock);
 	mutex_init(&host->eh_mutex);
+	host->n_tags = ATA_MAX_QUEUE - 1;
 	host->dev = dev;
 	host->ops = ops;
 }
@@ -6175,15 +6175,7 @@ int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
 {
 	int i, rc;
 
-	/*
-	 * The max queue supported by hardware must not be greater than
-	 * ATA_MAX_QUEUE.
-	 */
-	if (sht->can_queue > ATA_MAX_QUEUE) {
-		dev_err(host->dev, "BUG: the hardware max queue is too large\n");
-		WARN_ON(1);
-		return -EINVAL;
-	}
+	host->n_tags = clamp(sht->can_queue, 1, ATA_MAX_QUEUE - 1);
 
 	/* host must have been started */
 	if (!(host->flags & ATA_HOST_STARTED)) {
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 5ab4e3a76721..92abb497ab14 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -593,6 +593,7 @@ struct ata_host {
 	struct device 		*dev;
 	void __iomem * const	*iomap;
 	unsigned int		n_ports;
+	unsigned int		n_tags;			/* nr of NCQ tags */
 	void			*private_data;
 	struct ata_port_operations *ops;
 	unsigned long		flags;
-- 
cgit v1.2.3-59-g8ed1b


From a0f7a756c2f7543585657cdeeefdfcc11b567293 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Wed, 23 Jul 2014 14:00:01 -0700
Subject: mm/rmap.c: fix pgoff calculation to handle hugepage correctly

I triggered VM_BUG_ON() in vma_address() when I tried to migrate an
anonymous hugepage with mbind() in the kernel v3.16-rc3.  This is
because pgoff's calculation in rmap_walk_anon() fails to consider
compound_order() only to have an incorrect value.

This patch introduces page_to_pgoff(), which gets the page's offset in
PAGE_CACHE_SIZE.

Kirill pointed out that page cache tree should natively handle
hugepages, and in order to make hugetlbfs fit it, page->index of
hugetlbfs page should be in PAGE_CACHE_SIZE.  This is beyond this patch,
but page_to_pgoff() contains the point to be fixed in a single function.

Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Naoya Horiguchi <nao.horiguchi@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pagemap.h | 12 ++++++++++++
 mm/memory-failure.c     |  4 ++--
 mm/rmap.c               | 10 +++-------
 3 files changed, 17 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 0a97b583ee8d..e1474ae18c88 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -398,6 +398,18 @@ static inline struct page *read_mapping_page(struct address_space *mapping,
 	return read_cache_page(mapping, index, filler, data);
 }
 
+/*
+ * Get the offset in PAGE_SIZE.
+ * (TODO: hugepage should have ->index in PAGE_SIZE)
+ */
+static inline pgoff_t page_to_pgoff(struct page *page)
+{
+	if (unlikely(PageHeadHuge(page)))
+		return page->index << compound_order(page);
+	else
+		return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+}
+
 /*
  * Return byte-offset into filesystem object for page.
  */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c6399e328931..7211a73ba14d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -435,7 +435,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
-	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff = page_to_pgoff(page);
 	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
@@ -469,7 +469,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	mutex_lock(&mapping->i_mmap_mutex);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+		pgoff_t pgoff = page_to_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
 
 		if (!t)
diff --git a/mm/rmap.c b/mm/rmap.c
index b7e94ebbd09e..22a4a7699cdb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -517,11 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 static inline unsigned long
 __vma_address(struct page *page, struct vm_area_struct *vma)
 {
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		pgoff = page->index << huge_page_order(page_hstate(page));
-
+	pgoff_t pgoff = page_to_pgoff(page);
 	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 }
 
@@ -1639,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
 static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma;
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff_t pgoff = page_to_pgoff(page);
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
@@ -1680,7 +1676,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct address_space *mapping = page->mapping;
-	pgoff_t pgoff = page->index << compound_order(page);
+	pgoff_t pgoff = page_to_pgoff(page);
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
-- 
cgit v1.2.3-59-g8ed1b


From 4972a74b888c6b52ca41fae6076786dbbeb746d5 Mon Sep 17 00:00:00 2001
From: Laura Abbott <lauraa@codeaurora.org>
Date: Tue, 15 Jul 2014 10:03:34 -0700
Subject: of: Split early_init_dt_scan into two parts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, early_init_dt_scan validates the header, sets the
boot params, and scans for chosen/memory all in one function.
Split this up into two separate functions (validation/setting
boot params in one, scanning in another) to allow for
additional setup between boot params and scanning the memory.

Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
Tested-by: Andreas Färber <afaerber@suse.de>
[glikely: s/early_init_dt_scan_all/early_init_dt_scan_nodes/]
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 drivers/of/fdt.c       | 18 +++++++++++++++++-
 include/linux/of_fdt.h |  2 ++
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index b777d8f46bd5..ecc7a02d868e 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -937,7 +937,7 @@ int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
 }
 #endif
 
-bool __init early_init_dt_scan(void *params)
+bool __init early_init_dt_verify(void *params)
 {
 	if (!params)
 		return false;
@@ -951,6 +951,12 @@ bool __init early_init_dt_scan(void *params)
 		return false;
 	}
 
+	return true;
+}
+
+
+void __init early_init_dt_scan_nodes(void)
+{
 	/* Retrieve various information from the /chosen node */
 	of_scan_flat_dt(early_init_dt_scan_chosen, boot_command_line);
 
@@ -959,7 +965,17 @@ bool __init early_init_dt_scan(void *params)
 
 	/* Setup memory, calling early_init_dt_add_memory_arch */
 	of_scan_flat_dt(early_init_dt_scan_memory, NULL);
+}
+
+bool __init early_init_dt_scan(void *params)
+{
+	bool status;
+
+	status = early_init_dt_verify(params);
+	if (!status)
+		return false;
 
+	early_init_dt_scan_nodes();
 	return true;
 }
 
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 05117899fcb4..ebb2449082fb 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -73,6 +73,8 @@ extern int early_init_dt_scan_root(unsigned long node, const char *uname,
 				   int depth, void *data);
 
 extern bool early_init_dt_scan(void *params);
+extern bool early_init_dt_verify(void *params);
+extern void early_init_dt_scan_nodes(void);
 
 extern const char *of_flat_dt_get_machine_name(void);
 extern const void *of_flat_dt_match_machine(const void *default_match,
-- 
cgit v1.2.3-59-g8ed1b


From 704033cee2e5b3c1c6eaf5bb398e465a9c3667b5 Mon Sep 17 00:00:00 2001
From: Laura Abbott <lauraa@codeaurora.org>
Date: Tue, 15 Jul 2014 10:03:35 -0700
Subject: of: Add memory limiting function for flattened devicetrees
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Buggy bootloaders may pass bogus memory entries in the devicetree.
Add of_fdt_limit_memory to add an upper bound on the number of
entries that can be present in the devicetree.

Signed-off-by: Laura Abbott <lauraa@codeaurora.org>
Tested-by: Andreas Färber <afaerber@suse.de>
Signed-off-by: Grant Likely <grant.likely@linaro.org>
---
 drivers/of/fdt.c       | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/of_fdt.h |  1 +
 2 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index ecc7a02d868e..9aa012e6ea0a 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -26,6 +26,54 @@
 #include <asm/setup.h>  /* for COMMAND_LINE_SIZE */
 #include <asm/page.h>
 
+/*
+ * of_fdt_limit_memory - limit the number of regions in the /memory node
+ * @limit: maximum entries
+ *
+ * Adjust the flattened device tree to have at most 'limit' number of
+ * memory entries in the /memory node. This function may be called
+ * any time after initial_boot_param is set.
+ */
+void of_fdt_limit_memory(int limit)
+{
+	int memory;
+	int len;
+	const void *val;
+	int nr_address_cells = OF_ROOT_NODE_ADDR_CELLS_DEFAULT;
+	int nr_size_cells = OF_ROOT_NODE_SIZE_CELLS_DEFAULT;
+	const uint32_t *addr_prop;
+	const uint32_t *size_prop;
+	int root_offset;
+	int cell_size;
+
+	root_offset = fdt_path_offset(initial_boot_params, "/");
+	if (root_offset < 0)
+		return;
+
+	addr_prop = fdt_getprop(initial_boot_params, root_offset,
+				"#address-cells", NULL);
+	if (addr_prop)
+		nr_address_cells = fdt32_to_cpu(*addr_prop);
+
+	size_prop = fdt_getprop(initial_boot_params, root_offset,
+				"#size-cells", NULL);
+	if (size_prop)
+		nr_size_cells = fdt32_to_cpu(*size_prop);
+
+	cell_size = sizeof(uint32_t)*(nr_address_cells + nr_size_cells);
+
+	memory = fdt_path_offset(initial_boot_params, "/memory");
+	if (memory > 0) {
+		val = fdt_getprop(initial_boot_params, memory, "reg", &len);
+		if (len > limit*cell_size) {
+			len = limit*cell_size;
+			pr_debug("Limiting number of entries to %d\n", limit);
+			fdt_setprop(initial_boot_params, memory, "reg", val,
+					len);
+		}
+	}
+}
+
 /**
  * of_fdt_is_compatible - Return true if given node from the given blob has
  * compat in its compatible list
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index ebb2449082fb..0ff360d5b3b3 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -86,6 +86,7 @@ extern void unflatten_and_copy_device_tree(void);
 extern void early_init_devtree(void *);
 extern void early_get_first_memblock_info(void *, phys_addr_t *);
 extern u64 fdt_translate_address(const void *blob, int node_offset);
+extern void of_fdt_limit_memory(int limit);
 #else /* CONFIG_OF_FLATTREE */
 static inline void early_init_fdt_scan_reserved_mem(void) {}
 static inline const char *of_flat_dt_get_machine_name(void) { return NULL; }
-- 
cgit v1.2.3-59-g8ed1b


From 8f1d26d0e59b9676587c54578f976709b625d6e9 Mon Sep 17 00:00:00 2001
From: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Date: Wed, 30 Jul 2014 16:08:39 -0700
Subject: kexec: export free_huge_page to VMCOREINFO

PG_head_mask was added into VMCOREINFO to filter huge pages in b3acc56bfe1
("kexec: save PG_head_mask in VMCOREINFO"), but makedumpfile still need
another symbol to filter *hugetlbfs* pages.

If a user hope to filter user pages, makedumpfile tries to exclude them by
checking the condition whether the page is anonymous, but hugetlbfs pages
aren't anonymous while they also be user pages.

We know it's possible to detect them in the same way as PageHuge(),
so we need the start address of free_huge_page():

    int PageHuge(struct page *page)
    {
            if (!PageCompound(page))
                    return 0;

            page = compound_head(page);
            return get_compound_page_dtor(page) == free_huge_page;
    }

For that reason, this patch changes free_huge_page() into public
to export it to VMCOREINFO.

Signed-off-by: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Acked-by: Baoquan He <bhe@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 1 +
 kernel/kexec.c          | 2 ++
 mm/hugetlb.c            | 2 +-
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 255cd5cc0754..a23c096b3080 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -80,6 +80,7 @@ int dequeue_hwpoisoned_huge_page(struct page *page);
 bool isolate_huge_page(struct page *page, struct list_head *list);
 void putback_active_hugepage(struct page *page);
 bool is_hugepage_active(struct page *page);
+void free_huge_page(struct page *page);
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 369f41a94124..23a088fec3c0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
 #include <linux/swap.h>
 #include <linux/syscore_ops.h>
 #include <linux/compiler.h>
+#include <linux/hugetlb.h>
 
 #include <asm/page.h>
 #include <asm/uaccess.h>
@@ -1619,6 +1620,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 #endif
 	VMCOREINFO_NUMBER(PG_head_mask);
 	VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
+	VMCOREINFO_SYMBOL(free_huge_page);
 
 	arch_crash_save_vmcoreinfo();
 	update_vmcoreinfo_note();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9221c02ed9e2..7a0a73d2fcff 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -856,7 +856,7 @@ struct hstate *size_to_hstate(unsigned long size)
 	return NULL;
 }
 
-static void free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
 {
 	/*
 	 * Can't pass hstate in here because it is called from the
-- 
cgit v1.2.3-59-g8ed1b